feat: 将数据集从国外员工缺勤数据替换为中国企业缺勤模拟数据

- 新增中国企业员工缺勤模拟数据集生成脚本(generate_dataset.py)，覆盖7个行业、180家企业、2600名员工
- 重构 config.py，更新特征字段为中文名称，调整目标列、员工ID、行业类型等配置
- 重构 clustering.py，简化聚类逻辑，更新聚类特征和群体命名（高压通勤型、健康波动型等）
- 重构 feature_mining.py，更新相关性分析和群体比较维度（按行业、班次、婚姻状态等）
- 新增 model_features.py 定义模型训练特征
- 更新 preprocessing.py 和 train_model.py 适配新数据结构
- 更新各 API 路由默认参数（model: random_forest, dimension: industry）
- 前端更新主题样式和各视图组件适配中文字段
- 更新系统名称为 China Enterprise Absence Analysis System
2026-03-11 10:46:58 +08:00
parent a39d8b2fd2
commit e63267cef6
39 changed files with 15731 additions and 5648 deletions
@@ -8,7 +8,7 @@ analysis_bp = Blueprint('analysis', __name__, url_prefix='/api/analysis')
@analysis_bp.route('/importance', methods=['GET'])
 def get_importance():
    try:
-        model_type = request.args.get('model', 'rf')
+        model_type = request.args.get('model', 'random_forest')
        result = analysis_service.get_feature_importance(model_type)
        return jsonify({
            'code': 200,
@@ -43,7 +43,7 @@ def get_correlation():
@analysis_bp.route('/compare', methods=['GET'])
 def get_compare():
    try:
-        dimension = request.args.get('dimension', 'drinker')
+        dimension = request.args.get('dimension', 'industry')
        result = analysis_service.get_group_comparison(dimension)
        return jsonify({
            'code': 200,
@@ -49,8 +49,8 @@ def get_profile():
 def get_scatter():
    try:
        n_clusters = request.args.get('n_clusters', 3, type=int)
-        x_axis = request.args.get('x_axis', 'Age')
-        y_axis = request.args.get('y_axis', 'Absenteeism time in hours')
+        x_axis = request.args.get('x_axis', '月均加班时长')
+        y_axis = request.args.get('y_axis', '缺勤时长（小时）')
        
        n_clusters = max(2, min(10, n_clusters))
        
@@ -15,7 +15,7 @@ def create_app():
    def index():
        return {
            'code': 200,
-            'message': 'Employee Absenteeism Analysis System API',
+            'message': 'China Enterprise Absence Analysis System API',
            'data': {
                'version': '1.0.0',
                'endpoints': {
@@ -1,14 +1,15 @@
 import os

+
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))

 DATA_DIR = os.path.join(BASE_DIR, 'data')
 RAW_DATA_DIR = os.path.join(DATA_DIR, 'raw')
 PROCESSED_DATA_DIR = os.path.join(DATA_DIR, 'processed')
-
 MODELS_DIR = os.path.join(BASE_DIR, 'models')

-RAW_DATA_PATH = os.path.join(RAW_DATA_DIR, 'Absenteeism_at_work.csv')
+RAW_DATA_FILENAME = 'china_enterprise_absence_events.csv'
+RAW_DATA_PATH = os.path.join(RAW_DATA_DIR, RAW_DATA_FILENAME)
 CLEAN_DATA_PATH = os.path.join(PROCESSED_DATA_DIR, 'clean_data.csv')

 RF_MODEL_PATH = os.path.join(MODELS_DIR, 'rf_model.pkl')
@@ -17,132 +18,127 @@ KMEANS_MODEL_PATH = os.path.join(MODELS_DIR, 'kmeans_model.pkl')
 SCALER_PATH = os.path.join(MODELS_DIR, 'scaler.pkl')
 ENCODER_PATH = os.path.join(MODELS_DIR, 'encoder.pkl')

-CSV_SEPARATOR = ';'
-
+CSV_SEPARATOR = ','
 RANDOM_STATE = 42
 TEST_SIZE = 0.2

-FEATURE_NAMES = [
-    'ID',
-    'Reason for absence',
-    'Month of absence',
-    'Day of the week',
-    'Seasons',
-    'Transportation expense',
-    'Distance from Residence to Work',
-    'Service time',
-    'Age',
-    'Work load Average/day ',
-    'Hit target',
-    'Disciplinary failure',
-    'Education',
-    'Son',
-    'Social drinker',
-    'Social smoker',
-    'Pet',
-    'Weight',
-    'Height',
-    'Body mass index',
-    'Absenteeism time in hours'
-]
-
-CATEGORICAL_FEATURES = [
-    'Reason for absence',
-    'Month of absence',
-    'Day of the week',
-    'Seasons',
-    'Disciplinary failure',
-    'Education',
-    'Social drinker',
-    'Social smoker'
-]
-
-NUMERICAL_FEATURES = [
-    'Transportation expense',
-    'Distance from Residence to Work',
-    'Service time',
-    'Age',
-    'Work load Average/day ',
-    'Hit target',
-    'Son',
-    'Pet',
-    'Body mass index'
-]
-
-REASON_NAMES = {
-    0: '未知原因',
-    1: '传染病',
-    2: '肿瘤',
-    3: '血液疾病',
-    4: '内分泌疾病',
-    5: '精神行为障碍',
-    6: '神经系统疾病',
-    7: '眼部疾病',
-    8: '耳部疾病',
-    9: '循环系统疾病',
-    10: '呼吸系统疾病',
-    11: '消化系统疾病',
-    12: '皮肤疾病',
-    13: '肌肉骨骼疾病',
-    14: '泌尿生殖疾病',
-    15: '妊娠相关',
-    16: '围产期疾病',
-    17: '先天性畸形',
-    18: '症状体征',
-    19: '损伤中毒',
-    20: '外部原因',
-    21: '健康因素',
-    22: '医疗随访',
-    23: '医疗咨询',
-    24: '献血',
-    25: '实验室检查',
-    26: '无故缺勤',
-    27: '理疗',
-    28: '牙科咨询'
-}
+TARGET_COLUMN = '缺勤时长（小时）'
+EMPLOYEE_ID_COLUMN = '员工编号'
+COMPANY_ID_COLUMN = '企业编号'

 WEEKDAY_NAMES = {
-    2: '周一',
-    3: '周二',
-    4: '周三',
-    5: '周四',
-    6: '周五'
+    1: '周一',
+    2: '周二',
+    3: '周三',
+    4: '周四',
+    5: '周五',
+    6: '周六',
+    7: '周日',
 }

 SEASON_NAMES = {
-    1: '夏季',
-    2: '秋季',
-    3: '冬季',
-    4: '春季'
+    1: '冬季',
+    2: '春季',
+    3: '夏季',
+    4: '秋季',
 }

-EDUCATION_NAMES = {
-    1: '高中',
-    2: '本科',
-    3: '研究生',
-    4: '博士'
-}
+INDUSTRY_NAMES = [
+    '制造业',
+    '互联网',
+    '零售连锁',
+    '物流运输',
+    '金融服务',
+    '医药健康',
+    '建筑工程',
+]
+
+LEAVE_TYPE_NAMES = [
+    '病假',
+    '事假',
+    '年假',
+    '调休',
+    '婚假',
+    '丧假',
+    '产检育儿假',
+    '工伤假',
+    '其他',
+]
+
+REASON_CATEGORY_NAMES = [
+    '身体不适',
+    '家庭事务',
+    '子女照护',
+    '交通受阻',
+    '突发事件',
+    '职业疲劳',
+    '就医复查',
+]

 FEATURE_NAME_CN = {
-    'ID': '员工标识',
-    'Reason for absence': '缺勤原因',
-    'Month of absence': '缺勤月份',
-    'Day of the week': '星期几',
-    'Seasons': '季节',
-    'Transportation expense': '交通费用',
-    'Distance from Residence to Work': '通勤距离',
-    'Service time': '工龄',
-    'Age': '年龄',
-    'Work load Average/day ': '日均工作负荷',
-    'Hit target': '达标率',
-    'Disciplinary failure': '违纪记录',
-    'Education': '学历',
-    'Son': '子女数量',
-    'Social drinker': '饮酒习惯',
-    'Social smoker': '吸烟习惯',
-    'Pet': '宠物数量',
-    'Weight': '体重',
-    'Height': '身高',
-    'Body mass index': 'BMI指数',
-    'Absenteeism time in hours': '缺勤时长'
+    '企业编号': '企业编号',
+    '所属行业': '所属行业',
+    '企业规模': '企业规模',
+    '所在城市等级': '所在城市等级',
+    '用工类型': '用工类型',
+    '部门条线': '部门条线',
+    '岗位序列': '岗位序列',
+    '岗位级别': '岗位级别',
+    '员工编号': '员工编号',
+    '性别': '性别',
+    '年龄': '年龄',
+    '司龄年数': '司龄年数',
+    '最高学历': '最高学历',
+    '婚姻状态': '婚姻状态',
+    '是否本地户籍': '是否本地户籍',
+    '子女数量': '子女数量',
+    '是否独生子女家庭负担': '独生子女家庭负担',
+    '居住类型': '居住类型',
+    '班次类型': '班次类型',
+    '是否夜班岗位': '是否夜班岗位',
+    '月均加班时长': '月均加班时长',
+    '近30天出勤天数': '近30天出勤天数',
+    '近90天缺勤次数': '近90天缺勤次数',
+    '近180天请假总时长': '近180天请假总时长',
+    '通勤时长分钟': '通勤时长分钟',
+    '通勤距离公里': '通勤距离公里',
+    '是否跨城通勤': '是否跨城通勤',
+    '绩效等级': '绩效等级',
+    '近12月违纪次数': '近12月违纪次数',
+    '团队人数': '团队人数',
+    '直属上级管理跨度': '直属上级管理跨度',
+    'BMI': 'BMI',
+    '是否慢性病史': '是否慢性病史',
+    '年度体检异常标记': '年度体检异常',
+    '近30天睡眠时长均值': '睡眠时长',
+    '每周运动频次': '运动频次',
+    '是否吸烟': '是否吸烟',
+    '是否饮酒': '是否饮酒',
+    '心理压力等级': '心理压力等级',
+    '是否长期久坐岗位': '是否久坐岗位',
+    '缺勤月份': '缺勤月份',
+    '星期几': '星期几',
+    '是否节假日前后': '节假日前后',
+    '季节': '季节',
+    '请假申请渠道': '请假申请渠道',
+    '请假类型': '请假类型',
+    '请假原因大类': '请假原因大类',
+    '是否提供医院证明': '医院证明',
+    '是否临时请假': '临时请假',
+    '是否连续缺勤': '连续缺勤',
+    '前一工作日是否加班': '前一工作日加班',
+    '缺勤时长（小时）': '缺勤时长',
+    '加班通勤压力指数': '加班通勤压力指数',
+    '家庭负担指数': '家庭负担指数',
+    '健康风险指数': '健康风险指数',
+    '岗位稳定性指数': '岗位稳定性指数',
+    '节假日风险标记': '节假日风险标记',
+    '排班压力标记': '排班压力标记',
+    '缺勤历史强度': '缺勤历史强度',
+    '生活规律指数': '生活规律指数',
+    '管理负荷指数': '管理负荷指数',
+    '工龄分层': '工龄分层',
+    '年龄分层': '年龄分层',
+    '通勤分层': '通勤分层',
+    '加班分层': '加班分层',
 }
@@ -1,9 +1,6 @@
-import pandas as pd
 import numpy as np
 from sklearn.cluster import KMeans
 from sklearn.preprocessing import MinMaxScaler
-import joblib
-import os

 import config
 from core.preprocessing import get_clean_data
@@ -14,216 +11,123 @@ class KMeansAnalyzer:
        self.n_clusters = n_clusters
        self.model = None
        self.scaler = MinMaxScaler()
-        self.data = None
-        self.data_scaled = None
        self.labels = None
-        
-    def _get_feature_columns(self, df):
-        df.columns = [col.strip() for col in df.columns]
-        
-        feature_map = {
-            'Age': None,
-            'Service time': None,
-            'Work load Average/day': None,
-            'Body mass index': None,
-            'Absenteeism time in hours': None
-        }
-        
-        for key in feature_map:
-            if key in df.columns:
-                feature_map[key] = key
-            else:
-                for col in df.columns:
-                    if key.replace(' ', '').lower() == col.replace(' ', '').lower():
-                        feature_map[key] = col
-                        break
-        
-        actual_features = [v for v in feature_map.values() if v is not None]
-        return actual_features
-    
+        self.feature_cols = [
+            '年龄',
+            '司龄年数',
+            '月均加班时长',
+            '通勤时长分钟',
+            'BMI',
+            '缺勤时长（小时）',
+        ]
+
    def fit(self, n_clusters=None):
        if n_clusters:
            self.n_clusters = n_clusters
-        
-        df = get_clean_data()
-        df = df.reset_index(drop=True)
-        
-        feature_cols = self._get_feature_columns(df)
-        
-        if not feature_cols:
-            feature_cols = ['Age', 'Service time', 'Body mass index', 'Absenteeism time in hours']
-            feature_cols = [c for c in feature_cols if c in df.columns]
-        
-        self.data = df[feature_cols].values
-        
-        self.scaler = MinMaxScaler()
-        self.data_scaled = self.scaler.fit_transform(self.data)
-        
-        self.model = KMeans(
-            n_clusters=self.n_clusters,
-            random_state=config.RANDOM_STATE,
-            n_init=10
-        )
-        
-        self.labels = self.model.fit_predict(self.data_scaled)
-        
+        df = get_clean_data().reset_index(drop=True)
+        data = df[self.feature_cols].values
+        data_scaled = self.scaler.fit_transform(data)
+        self.model = KMeans(n_clusters=self.n_clusters, random_state=config.RANDOM_STATE, n_init=10)
+        self.labels = self.model.fit_predict(data_scaled)
        return self.model
-    
+
    def get_cluster_results(self, n_clusters=3):
        if self.model is None or self.n_clusters != n_clusters:
            self.fit(n_clusters)
-        
        centers = self.scaler.inverse_transform(self.model.cluster_centers_)
-        
        unique, counts = np.unique(self.labels, return_counts=True)
        total = len(self.labels)
-        
-        cluster_names = self._generate_cluster_names(centers)
-        
-        feature_cols = self._get_feature_columns(get_clean_data())
-        
+        names = self._generate_cluster_names(centers)
        clusters = []
-        for i, (cluster_id, count) in enumerate(zip(unique, counts)):
-            center_dict = {}
-            for j, fname in enumerate(feature_cols):
-                if j < len(centers[i]):
-                    center_dict[fname] = round(centers[i][j], 2)
-            
+        for cluster_id, count in zip(unique, counts):
+            center = centers[int(cluster_id)]
            clusters.append({
                'id': int(cluster_id),
-                'name': cluster_names.get(cluster_id, f'群体{cluster_id+1}'),
+                'name': names.get(int(cluster_id), f'群体{int(cluster_id) + 1}'),
                'member_count': int(count),
                'percentage': round(count / total * 100, 1),
-                'center': center_dict,
-                'description': self._generate_description(cluster_names.get(cluster_id, ''))
+                'center': {
+                    feature: round(float(value), 2)
+                    for feature, value in zip(self.feature_cols, center)
+                },
+                'description': self._generate_description(names.get(int(cluster_id), '')),
            })
-        
-        return {
-            'n_clusters': self.n_clusters,
-            'clusters': clusters
-        }
-    
+        return {'n_clusters': self.n_clusters, 'clusters': clusters}
+
    def get_cluster_profile(self, n_clusters=3):
        if self.model is None or self.n_clusters != n_clusters:
            self.fit(n_clusters)
-        
        centers_scaled = self.model.cluster_centers_
-        
-        df = get_clean_data()
-        df.columns = [col.strip() for col in df.columns]
-        feature_cols = self._get_feature_columns(df)
-        
-        dimensions = ['年龄', '工龄', '工作负荷', 'BMI', '缺勤倾向'][:len(feature_cols)]
-        
-        cluster_names = self._generate_cluster_names(
-            self.scaler.inverse_transform(centers_scaled)
-        )
-        
-        clusters = []
-        for i in range(self.n_clusters):
-            clusters.append({
-                'id': i,
-                'name': cluster_names.get(i, f'群体{i+1}'),
-                'values': [round(v, 2) for v in centers_scaled[i]]
-            })
-        
+        names = self._generate_cluster_names(self.scaler.inverse_transform(centers_scaled))
        return {
-            'dimensions': dimensions,
-            'dimension_keys': feature_cols,
-            'clusters': clusters
+            'dimensions': ['年龄', '司龄', '加班', '通勤', 'BMI', '缺勤'],
+            'dimension_keys': self.feature_cols,
+            'clusters': [
+                {
+                    'id': idx,
+                    'name': names.get(idx, f'群体{idx + 1}'),
+                    'values': [round(float(v), 2) for v in centers_scaled[idx]],
+                }
+                for idx in range(self.n_clusters)
+            ],
        }
-    
-    def get_scatter_data(self, n_clusters=3, x_axis='Age', y_axis='Absenteeism time in hours'):
+
+    def get_scatter_data(self, n_clusters=3, x_axis='月均加班时长', y_axis='缺勤时长（小时）'):
+        """Return up to 500 scatter points labelled with their cluster id.
+
+        Refits the model when none exists or when ``n_clusters`` differs
+        from the fitted value.
+
+        Args:
+            n_clusters: Number of clusters the labels should come from.
+            x_axis: Column plotted on the x axis. An unknown or non-numeric
+                column falls back to '月均加班时长'.
+            y_axis: Column plotted on the y axis. An unknown or non-numeric
+                column falls back to ``config.TARGET_COLUMN``.
+
+        Returns:
+            dict with the resolved axis keys, their display names from
+            ``config.FEATURE_NAME_CN``, the ``points`` list and a
+            ``cluster_colors`` palette keyed by cluster id as a string.
+        """
        if self.model is None or self.n_clusters != n_clusters:
            self.fit(n_clusters)
-        
-        df = get_clean_data()
-        df = df.reset_index(drop=True)
-        df.columns = [col.strip() for col in df.columns]
-        
-        x_col = None
-        y_col = None
-        
-        for col in df.columns:
-            if x_axis.replace(' ', '').lower() in col.replace(' ', '').lower():
-                x_col = col
-            if y_axis.replace(' ', '').lower() in col.replace(' ', '').lower():
-                y_col = col
-        
-        if x_col is None:
-            x_col = df.columns[0]
-        if y_col is None:
-            y_col = df.columns[-1]
-        
+        df = get_clean_data().reset_index(drop=True)
+
+        def _plottable(axis):
+            # float() below needs numeric data; dtype.kind is 'O' for text
+            # columns, so those are rejected instead of raising later.
+            return axis in df.columns and df[axis].dtype.kind in 'biuf'
+
+        if not _plottable(x_axis):
+            x_axis = '月均加班时长'
+        if not _plottable(y_axis):
+            y_axis = config.TARGET_COLUMN
+        # Only the first 500 points are returned, so do not convert the rest.
+        df = df.head(500)
        points = []
        for idx in range(min(len(df), len(self.labels))):
            row = df.iloc[idx]
            points.append({
-                'employee_id': int(row['ID']),
-                'x': float(row[x_col]),
-                'y': float(row[y_col]),
-                'cluster_id': int(self.labels[idx])
+                'employee_id': str(row[config.EMPLOYEE_ID_COLUMN]),
+                'x': float(row[x_axis]),
+                'y': float(row[y_axis]),
+                'cluster_id': int(self.labels[idx]),
            })
-        
-        cluster_colors = {
-            '0': '#67C23A',
-            '1': '#E6A23C',
-            '2': '#F56C6C',
-            '3': '#909399',
-            '4': '#409EFF'
-        }
-        
        return {
-            'x_axis': x_col,
-            'x_axis_name': config.FEATURE_NAME_CN.get(x_col, x_col),
-            'y_axis': y_col,
-            'y_axis_name': config.FEATURE_NAME_CN.get(y_col, y_col),
+            'x_axis': x_axis,
+            'x_axis_name': config.FEATURE_NAME_CN.get(x_axis, x_axis),
+            'y_axis': y_axis,
+            'y_axis_name': config.FEATURE_NAME_CN.get(y_axis, y_axis),
            'points': points[:500],
-            'cluster_colors': cluster_colors
+            # Ten entries: the scatter route clamps n_clusters to 2..10.
+            'cluster_colors': {
+                '0': '#5B8FF9',
+                '1': '#61DDAA',
+                '2': '#F6BD16',
+                '3': '#E8684A',
+                '4': '#6DC8EC',
+                '5': '#9270CA',
+                '6': '#FF9D4D',
+                '7': '#269A99',
+                '8': '#FF99C3',
+                '9': '#5D7092',
+            },
        }
-    
+
    def _generate_cluster_names(self, centers):
        names = {}
-        
-        for i, center in enumerate(centers):
-            if len(center) >= 5:
-                service_time = center[1]
-                work_load = center[2]
-                bmi = center[3]
-                absent = center[4]
+        for idx, center in enumerate(centers):
+            _, tenure, overtime, commute, bmi, absence = center
+            if overtime > 38 and commute > 55 and absence > 8:
+                names[idx] = '高压通勤型'
+            elif bmi > 27 and absence > 8:
+                names[idx] = '健康波动型'
+            elif tenure > 8 and absence < 6:
+                names[idx] = '稳定低风险型'
+            elif overtime > 28 and absence > 7:
+                names[idx] = '轮班负荷型'
            else:
-                service_time = center[1] if len(center) > 1 else 0
-                work_load = 0
-                bmi = center[2] if len(center) > 2 else 0
-                absent = center[3] if len(center) > 3 else 0
-            
-            if service_time > 15 and absent < 3:
-                names[i] = '模范型员工'
-            elif work_load > 260 and absent > 5:
-                names[i] = '压力型员工'
-            elif bmi > 28:
-                names[i] = '生活习惯型员工'
-            else:
-                names[i] = f'群体{i+1}'
-        
+                names[idx] = f'群体{idx + 1}'
        return names
-    
+
    def _generate_description(self, name):
        descriptions = {
-            '模范型员工': '工龄长、工作稳定、缺勤率低',
-            '压力型员工': '工作负荷大、缺勤较多',
-            '生活习惯型员工': 'BMI偏高、需关注健康'
+            '高压通勤型': '加班和通勤压力都高，缺勤时长偏长。',
+            '健康波动型': '健康相关风险更高，需要重点关注。',
+            '稳定低风险型': '司龄较长，缺勤水平稳定且偏低。',
+            '轮班负荷型': '排班和工作负荷较重，缺勤风险较高。',
        }
-        return descriptions.get(name, '常规员工群体')
-    
-    def save_model(self):
-        os.makedirs(config.MODELS_DIR, exist_ok=True)
-        joblib.dump(self.model, config.KMEANS_MODEL_PATH)
-    
-    def load_model(self):
-        if os.path.exists(config.KMEANS_MODEL_PATH):
-            self.model = joblib.load(config.KMEANS_MODEL_PATH)
-            self.n_clusters = self.model.n_clusters
+        return descriptions.get(name, '常规员工群体。')


 kmeans_analyzer = KMeansAnalyzer()
@@ -1,4 +1,3 @@
-import pandas as pd
 import numpy as np

 import config
@@ -7,145 +6,67 @@ from core.preprocessing import get_clean_data

 def calculate_correlation():
    df = get_clean_data()
-    
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
-    
-    if 'ID' in numeric_cols:
-        numeric_cols.remove('ID')
-    
-    corr_matrix = df[numeric_cols].corr()
-    
-    return corr_matrix
+    for candidate in [config.EMPLOYEE_ID_COLUMN]:
+        if candidate in numeric_cols:
+            numeric_cols.remove(candidate)
+    return df[numeric_cols].corr()


 def get_correlation_for_heatmap():
    corr_matrix = calculate_correlation()
-    
    key_features = [
-        'Age',
-        'Service time',
-        'Distance from Residence to Work',
-        'Work load Average/day ',
-        'Body mass index',
-        'Absenteeism time in hours'
+        '月均加班时长',
+        '通勤时长分钟',
+        '近90天缺勤次数',
+        'BMI',
+        '近30天睡眠时长均值',
+        '缺勤时长（小时）',
    ]
-    
    key_features = [f for f in key_features if f in corr_matrix.columns]
-    
    sub_matrix = corr_matrix.loc[key_features, key_features]
-    
-    result = {
+    return {
        'features': [config.FEATURE_NAME_CN.get(f, f) for f in key_features],
-        'matrix': sub_matrix.values.round(2).tolist()
+        'matrix': sub_matrix.values.round(2).tolist(),
    }
-    
-    return result
-
-
-def calculate_feature_importance(model, feature_names):
-    if hasattr(model, 'feature_importances_'):
-        importance = model.feature_importances_
-    else:
-        raise ValueError("Model does not have feature_importances_ attribute")
-    
-    importance_dict = dict(zip(feature_names, importance))
-    
-    sorted_importance = sorted(importance_dict.items(), key=lambda x: x[1], reverse=True)
-    
-    return sorted_importance
-
-
-def get_feature_importance_from_model(model_path, feature_names):
-    import joblib
-    
-    model = joblib.load(model_path)
-    return calculate_feature_importance(model, feature_names)


 def group_comparison(dimension):
+    """Compare mean absence hours across the groups of one dimension.
+
+    Args:
+        dimension: One of 'industry', 'shift_type', 'job_family',
+            'marital_status' or 'chronic_disease'.
+
+    Returns:
+        dict with the dimension key, its display name, the per-group
+        statistics sorted by ``avg_hours`` descending, and the gap between
+        the highest and lowest group mean (absolute and as a percentage of
+        the lowest mean).
+
+    Raises:
+        ValueError: If ``dimension`` is not a supported key.
+    """
    df = get_clean_data()
-    
    dimension_map = {
-        'drinker': ('Social drinker', {0: '不饮酒', 1: '饮酒'}),
-        'smoker': ('Social smoker', {0: '不吸烟', 1: '吸烟'}),
-        'education': ('Education', {1: '高中', 2: '本科', 3: '研究生', 4: '博士'}),
-        'children': ('Son', {0: '无子女'}, lambda x: x > 0, '有子女'),
-        'pet': ('Pet', {0: '无宠物'}, lambda x: x > 0, '有宠物')
+        'industry': ('所属行业', None, '所属行业'),
+        'shift_type': ('班次类型', None, '班次类型'),
+        'job_family': ('岗位序列', None, '岗位序列'),
+        'marital_status': ('婚姻状态', None, '婚姻状态'),
+        'chronic_disease': ('是否慢性病史', {0: '无慢性病史', 1: '有慢性病史'}, '慢性病史'),
    }
-    
    if dimension not in dimension_map:
        raise ValueError(f"Invalid dimension: {dimension}")
-    
-    col, value_map = dimension_map[dimension][0], dimension_map[dimension][1]
-    
-    if dimension in ['children', 'pet']:
-        threshold_fn = dimension_map[dimension][2]
-        other_label = dimension_map[dimension][3]
-        
-        groups = []
-        for val in [0]:
-            group_df = df[df[col] == val]
-            if len(group_df) > 0:
-                groups.append({
-                    'name': value_map.get(val, str(val)),
-                    'value': val,
-                    'avg_hours': round(group_df['Absenteeism time in hours'].mean(), 2),
-                    'count': len(group_df),
-                    'percentage': round(len(group_df) / len(df) * 100, 1)
-                })
-        
-        group_df = df[df[col].apply(threshold_fn)]
-        if len(group_df) > 0:
-            groups.append({
-                'name': other_label,
-                'value': 1,
-                'avg_hours': round(group_df['Absenteeism time in hours'].mean(), 2),
-                'count': len(group_df),
-                'percentage': round(len(group_df) / len(df) * 100, 1)
-            })
-    else:
-        groups = []
-        for val in sorted(df[col].unique()):
-            group_df = df[df[col] == val]
-            if len(group_df) > 0:
-                groups.append({
-                    'name': value_map.get(val, str(val)),
-                    'value': int(val),
-                    'avg_hours': round(group_df['Absenteeism time in hours'].mean(), 2),
-                    'count': len(group_df),
-                    'percentage': round(len(group_df) / len(df) * 100, 1)
-                })
-    
-    if len(groups) >= 2:
-        diff_value = abs(groups[0]['avg_hours'] - groups[1]['avg_hours'])
-        base = min(groups[0]['avg_hours'], groups[1]['avg_hours'])
-        diff_percentage = round(diff_value / base * 100, 1) if base > 0 else 0
-    else:
-        diff_value = 0
-        diff_percentage = 0
-    
+
+    column, value_map, dimension_name = dimension_map[dimension]
+    total = len(df)
+    groups = []
+    # dropna(): NaN cannot be ordered against string labels, so sorting the
+    # raw unique values would raise on a column with missing entries.
+    for value in sorted(df[column].dropna().unique()):
+        group_df = df[df[column] == value]
+        label = value_map.get(value, value) if value_map else value
+        groups.append({
+            # str(): an unmapped code would otherwise leak a numpy scalar
+            # into the response, which is not JSON-serialisable.
+            'name': str(label),
+            'value': int(value) if isinstance(value, (int, np.integer)) else str(value),
+            'avg_hours': round(float(group_df[config.TARGET_COLUMN].mean()), 2),
+            'count': int(len(group_df)),
+            'percentage': round(len(group_df) / total * 100, 1),
+        })
+
+    groups.sort(key=lambda item: item['avg_hours'], reverse=True)
+    top = groups[0]['avg_hours'] if groups else 0
+    # With a single group top and bottom are the same entry, so the reported
+    # difference is 0 rather than the group's own mean.
+    bottom = groups[-1]['avg_hours'] if groups else 0
+    diff_value = round(top - bottom, 2)
+    diff_percentage = round(diff_value / bottom * 100, 1) if bottom else 0
+
    return {
        'dimension': dimension,
-        'dimension_name': {
-            'drinker': '饮酒习惯',
-            'smoker': '吸烟习惯',
-            'education': '学历',
-            'children': '子女',
-            'pet': '宠物'
-        }.get(dimension, dimension),
+        'dimension_name': dimension_name,
        'groups': groups,
        'difference': {
            'value': diff_value,
-            'percentage': diff_percentage
-        }
+            'percentage': diff_percentage,
+        },
    }
-
-
-if __name__ == '__main__':
-    print("Correlation matrix:")
-    corr = get_correlation_for_heatmap()
-    print(corr)
-    
-    print("\nGroup comparison (drinker):")
-    comp = group_comparison('drinker')
-    print(comp)
@@ -0,0 +1,336 @@
+import os
+import sys
+
+import numpy as np
+import pandas as pd
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import config
+
+
+# Per-industry sampling biases read by build_employee_pool:
+#   shift_bias    - scales the weights of the two rotating-shift options and
+#                   lowers the weight of the standard day shift.
+#   overtime_bias - raises the mean of the monthly-overtime normal draw.
+#   night_bias    - probability that a two-shift employee is flagged as night shift.
+# Key order matters: build_company_pool pairs the keys positionally with its
+# industry weight list.
+INDUSTRIES = {
+    '制造业': {'shift_bias': 0.9, 'overtime_bias': 0.8, 'night_bias': 0.8},
+    '互联网': {'shift_bias': 0.2, 'overtime_bias': 1.0, 'night_bias': 0.2},
+    '零售连锁': {'shift_bias': 0.7, 'overtime_bias': 0.5, 'night_bias': 0.3},
+    '物流运输': {'shift_bias': 0.9, 'overtime_bias': 0.7, 'night_bias': 0.9},
+    '金融服务': {'shift_bias': 0.1, 'overtime_bias': 0.7, 'night_bias': 0.1},
+    '医药健康': {'shift_bias': 0.6, 'overtime_bias': 0.6, 'night_bias': 0.5},
+    '建筑工程': {'shift_bias': 0.5, 'overtime_bias': 0.8, 'night_bias': 0.3},
+}
+
+
+def season_from_month(month):
+    """Return the season code for a month number.
+
+    Months 12/1/2 map to 1, 3-5 to 2, 6-8 to 3; any other value maps to 4.
+    """
+    month_groups = ((12, 1, 2), (3, 4, 5), (6, 7, 8))
+    for code, members in enumerate(month_groups, start=1):
+        if month in members:
+            return code
+    return 4
+
+
+def weighted_choice(rng, items, probs):
+    """Draw one element of ``items`` using ``probs`` as relative weights.
+
+    The weights are normalised to sum to 1 before being handed to ``rng.choice``.
+    """
+    weights = np.array(probs, dtype=float)
+    return rng.choice(items, p=weights / weights.sum())
+
+
+def build_company_pool(rng, company_count=180):
+    """Create the synthetic companies that employees are later assigned to.
+
+    Args:
+        rng: NumPy random generator used for every draw.
+        company_count: number of company records to create.
+
+    Returns:
+        List of dicts holding the company ID, industry, size band and city tier.
+    """
+    industry_names = list(INDUSTRIES.keys())
+    industry_weights = [0.22, 0.14, 0.14, 0.14, 0.1, 0.12, 0.14]
+    size_bands = ['100人以下', '100-499人', '500-999人', '1000-4999人', '5000人及以上']
+    size_weights = [0.15, 0.28, 0.2, 0.24, 0.13]
+    tier_names = ['一线', '新一线', '二线', '三线及以下']
+    tier_weights = [0.18, 0.34, 0.3, 0.18]
+
+    pool = []
+    for serial in range(1, company_count + 1):
+        # Draws stay in the order industry -> size -> tier so that a seeded
+        # generator remains reproducible.
+        sector = weighted_choice(rng, industry_names, industry_weights)
+        size_band = weighted_choice(rng, size_bands, size_weights)
+        tier = weighted_choice(rng, tier_names, tier_weights)
+        pool.append({
+            '企业编号': f'C{serial:03d}',
+            '所属行业': sector,
+            '企业规模': size_band,
+            '所在城市等级': tier,
+        })
+    return pool
+
+
+def build_employee_pool(rng, companies, employee_count=2600):
+    """Create synthetic employee records, each attached to one company.
+
+    Args:
+        rng: NumPy random generator used for every draw. The order of draws
+            (including the ones inside the result dict literal) determines the
+            output for a given seed, so statements must not be reordered.
+        companies: list of company dicts as produced by build_company_pool.
+        employee_count: number of employee records to create.
+
+    Returns:
+        List of dicts, one per employee, carrying company attributes plus
+        demographic, work, commute and health fields.
+    """
+    genders = ['男', '女']
+    employment_types = ['正式员工', '劳务派遣', '外包驻场', '实习生']
+    departments = ['生产', '研发', '销售', '客服', '职能', '仓储物流', '门店运营']
+    job_families = ['管理', '专业技术', '销售业务', '生产操作', '行政支持', '客服坐席']
+    job_levels = ['初级', '中级', '高级', '主管', '经理及以上']
+    educations = ['中专及以下', '大专', '本科', '硕士', '博士']
+    marital = ['未婚', '已婚', '离异/其他']
+    housing = ['自有住房', '租房', '宿舍']
+    shifts = ['标准白班', '两班倒', '三班倒', '弹性班']
+    performance = ['A', 'B', 'C', 'D']
+    stress = ['低', '中', '高']
+
+    employees = []
+    for idx in range(employee_count):
+        # Companies are picked uniformly, so company attributes repeat across employees.
+        company = companies[rng.integers(0, len(companies))]
+        industry = company['所属行业']
+        # Age: normal(33, 7) clipped to [20, 55] and truncated to an integer.
+        age = int(np.clip(rng.normal(33, 7), 20, 55))
+        # Tenure tracks age minus 21 with noise, clipped to [0.2, 32] years.
+        tenure = round(float(np.clip(age - 21 + rng.normal(0, 2), 0.2, 32)), 1)
+        family_bias = 0.6 if age >= 30 else 0.25
+        # `married` holds the marital-status label, not a boolean; the weights
+        # shift towards the married label from age 30 on.
+        married = weighted_choice(rng, marital, [0.45, 0.48, 0.07] if age < 30 else [0.18, 0.72, 0.1])
+        # NOTE(review): the unmarried label uses a Poisson mean of 0.4, which is
+        # higher than the 0.25 used for other under-30 employees - confirm intended.
+        children = int(np.clip(rng.poisson(0.4 if married == '未婚' else family_bias), 0, 3))
+        industry_profile = INDUSTRIES[industry]
+        # Weights are relative (weighted_choice normalises them): the day shift
+        # shrinks and the two rotating shifts grow with shift_bias.
+        shift = weighted_choice(
+            rng,
+            shifts,
+            [
+                max(0.1, 1 - industry_profile['shift_bias']),
+                0.35 * industry_profile['shift_bias'],
+                0.25 * industry_profile['shift_bias'],
+                0.2,
+            ],
+        )
+        # Three-shift posts are always night posts; two-shift posts are with
+        # probability night_bias (the random draw only happens for two-shift).
+        night_flag = int(shift == '三班倒' or (shift == '两班倒' and rng.random() < industry_profile['night_bias']))
+        overtime = float(np.clip(rng.normal(22 + 18 * industry_profile['overtime_bias'], 10), 0, 90))
+        commute_minutes = float(np.clip(rng.normal(42, 18), 8, 130))
+        # Distance is derived from commute time via a random factor in [0.35, 0.75).
+        commute_km = float(np.clip(commute_minutes * rng.uniform(0.35, 0.75), 2, 65))
+        performance_level = weighted_choice(rng, performance, [0.18, 0.46, 0.26, 0.1])
+        # Chronic-disease probability grows by 0.01 per year of age above 26, floor 0.05.
+        chronic_flag = int(rng.random() < max(0.05, (age - 26) * 0.01))
+        # Always abnormal when chronic; otherwise 14% (drawn only in that case).
+        check_abnormal = int(chronic_flag == 1 or rng.random() < 0.14)
+        sleep_hours = round(float(np.clip(rng.normal(6.9 - 0.35 * night_flag, 0.8), 4.5, 9.0)), 1)
+        exercise = int(np.clip(rng.poisson(2.2), 0, 7))
+        # Two draws: the inner one selects the smoking rate (0.22 with
+        # probability 0.55, else 0.08); the outer one applies it.
+        smoking = int(rng.random() < (0.22 if rng.random() < 0.55 else 0.08))
+        drinking = int(rng.random() < 0.27)
+        # The weight of the highest stress label rises with overtime, capped at +0.15.
+        stress_level = weighted_choice(
+            rng,
+            stress,
+            [0.22, 0.52, 0.26 + min(0.15, overtime / 120)],
+        )
+        bmi = round(float(np.clip(rng.normal(24.2, 3.2), 17.5, 36.5)), 1)
+        history_count = int(np.clip(rng.poisson(1.2 + chronic_flag * 0.6 + children * 0.15), 0, 8))
+        history_hours = float(np.clip(rng.normal(18 + chronic_flag * 10 + history_count * 3, 10), 0, 120))
+        discipline = int(np.clip(rng.poisson(0.2), 0, 4))
+        team_size = int(np.clip(rng.normal(11, 5), 3, 40))
+        manager_span = int(np.clip(team_size + rng.normal(3, 2), 4, 60))
+        local_hukou = int(rng.random() < 0.58)
+        # Cross-city when the commute exceeds 65 minutes, or for 35% of non-local employees.
+        cross_city = int(commute_minutes > 65 or (local_hukou == 0 and rng.random() < 0.35))
+        # Two named industries use a 45/55 weighted draw; all others use a 30% chance.
+        sedentary = int(weighted_choice(rng, [0, 1], [0.45, 0.55]) if company['所属行业'] in ['互联网', '金融服务'] else rng.random() < 0.3)
+
+        # The dict literal below performs further random draws in key order.
+        employees.append({
+            '企业编号': company['企业编号'],
+            '所属行业': industry,
+            '企业规模': company['企业规模'],
+            '所在城市等级': company['所在城市等级'],
+            '用工类型': weighted_choice(rng, employment_types, [0.74, 0.12, 0.1, 0.04]),
+            '部门条线': weighted_choice(rng, departments, [0.18, 0.16, 0.14, 0.11, 0.12, 0.14, 0.15]),
+            '岗位序列': weighted_choice(rng, job_families, [0.08, 0.24, 0.16, 0.2, 0.12, 0.2]),
+            '岗位级别': weighted_choice(rng, job_levels, [0.34, 0.32, 0.18, 0.11, 0.05]),
+            '员工编号': f'E{idx + 1:05d}',
+            '性别': weighted_choice(rng, genders, [0.56, 0.44]),
+            '年龄': age,
+            '司龄年数': tenure,
+            '最高学历': weighted_choice(rng, educations, [0.14, 0.28, 0.4, 0.15, 0.03]),
+            '婚姻状态': married,
+            '是否本地户籍': local_hukou,
+            '子女数量': children,
+            # Set for two or more children, or for 18% of married employees.
+            '是否独生子女家庭负担': int(children >= 2 or (married == '已婚' and rng.random() < 0.18)),
+            '居住类型': weighted_choice(rng, housing, [0.38, 0.48, 0.14]),
+            '班次类型': shift,
+            '是否夜班岗位': night_flag,
+            '月均加班时长': round(overtime, 1),
+            '近30天出勤天数': int(np.clip(rng.normal(21.5, 2.2), 14, 27)),
+            '近90天缺勤次数': history_count,
+            '近180天请假总时长': round(history_hours, 1),
+            '通勤时长分钟': round(commute_minutes, 1),
+            '通勤距离公里': round(commute_km, 1),
+            '是否跨城通勤': cross_city,
+            '绩效等级': performance_level,
+            '近12月违纪次数': discipline,
+            '团队人数': team_size,
+            '直属上级管理跨度': manager_span,
+            'BMI': bmi,
+            '是否慢性病史': chronic_flag,
+            '年度体检异常标记': check_abnormal,
+            '近30天睡眠时长均值': sleep_hours,
+            '每周运动频次': exercise,
+            '是否吸烟': smoking,
+            '是否饮酒': drinking,
+            '心理压力等级': stress_level,
+            '是否长期久坐岗位': sedentary,
+        })
+    return employees
+
+
+def sample_event(rng, employee):
+    """Sample one absence event for an employee and compute its duration.
+
+    Args:
+        rng: NumPy random generator; the order of draws determines the output
+            for a given seed.
+        employee: employee dict as produced by build_employee_pool.
+
+    Returns:
+        A new dict: a copy of ``employee`` extended with the event fields and
+        the absence duration in hours (clipped to [0.5, 24.0], one decimal).
+        The input dict is not modified.
+    """
+    month = int(rng.integers(1, 13))
+    weekday = int(rng.integers(1, 8))
+    # The listed months use a 30% near-holiday rate; all other months use 16%.
+    near_holiday = int(rng.random() < (0.3 if month in [1, 2, 4, 5, 9, 10] else 0.16))
+    leave_type_items = ['病假', '事假', '年假', '调休', '婚假', '丧假', '产检育儿假', '工伤假', '其他']
+    leave_type = weighted_choice(rng, leave_type_items, [0.3, 0.22, 0.12, 0.14, 0.03, 0.02, 0.06, 0.02, 0.09])
+    # Employees with children get the child-care reason 14% of the time; the
+    # random draw only happens when the employee has children.
+    if employee['子女数量'] > 0 and rng.random() < 0.14:
+        reason_category = '子女照护'
+    else:
+        reason_category = weighted_choice(
+            rng,
+            ['身体不适', '家庭事务', '交通受阻', '突发事件', '职业疲劳', '就医复查'],
+            [0.28, 0.19, 0.09, 0.11, 0.2, 0.13],
+        )
+    # Deterministic: implied by the leave type or by a health-related reason.
+    medical_certificate = int(leave_type in ['病假', '工伤假'] or reason_category in ['身体不适', '就医复查'])
+    urgent_leave = int(rng.random() < (0.45 if leave_type in ['病假', '事假', '工伤假'] else 0.18))
+    continuous_absence = int(rng.random() < (0.2 if leave_type in ['病假', '产检育儿假', '工伤假'] else 0.08))
+    # Probability grows with monthly overtime and is capped at 0.85.
+    previous_overtime = int(rng.random() < min(0.85, employee['月均加班时长'] / 65))
+    season = season_from_month(month)
+    channel = weighted_choice(rng, ['系统申请', '主管代提', '临时电话报备'], [0.68, 0.18, 0.14])
+
+    # Employee-level baseline in hours: risk factors add, protective factors subtract.
+    base = 0.95
+    base += min(employee['月均加班时长'] / 28, 1.8)
+    base += min(employee['通勤时长分钟'] / 65, 1.2)
+    base += employee['是否夜班岗位'] * 0.9
+    base += employee['是否慢性病史'] * 1.25
+    base += employee['年度体检异常标记'] * 0.6
+    base += 0.35 * employee['子女数量']
+    base += 0.5 if employee['心理压力等级'] == '高' else (0.2 if employee['心理压力等级'] == '中' else -0.1)
+    base += 0.4 if employee['是否跨城通勤'] else 0
+    base += 0.35 if previous_overtime else 0
+    base += 0.35 if near_holiday else 0
+    base += 0.3 if continuous_absence else 0
+    base += 0.3 if employee['近90天缺勤次数'] >= 3 else 0
+    base -= 0.35 if employee['绩效等级'] == 'A' else (0.15 if employee['绩效等级'] == 'B' else 0)
+    base -= min(employee['司龄年数'] / 40, 0.5)
+    base -= min(employee['每周运动频次'] * 0.08, 0.3)
+    base -= 0.2 if employee['近30天睡眠时长均值'] >= 7.5 else 0
+
+    # Additive per-category offsets in hours, looked up by the sampled values.
+    leave_bonus = {
+        '病假': 2.0,
+        '事假': 0.8,
+        '年假': 0.1,
+        '调休': 0.1,
+        '婚假': 3.0,
+        '丧假': 2.8,
+        '产检育儿假': 2.4,
+        '工伤假': 3.8,
+        '其他': 0.5,
+    }
+    reason_bonus = {
+        '身体不适': 1.0,
+        '家庭事务': 0.5,
+        '子女照护': 0.8,
+        '交通受阻': 0.2,
+        '突发事件': 0.6,
+        '职业疲劳': 0.7,
+        '就医复查': 1.2,
+    }
+    industry_bonus = {
+        '制造业': 0.35,
+        '互联网': 0.2,
+        '零售连锁': 0.25,
+        '物流运输': 0.4,
+        '金融服务': 0.1,
+        '医药健康': 0.2,
+        '建筑工程': 0.35,
+    }
+    season_bonus = {1: 0.35, 2: 0.0, 3: 0.15, 4: 0.05}
+    weekday_bonus = {1: 0.05, 2: 0.0, 3: 0.0, 4: 0.05, 5: 0.15, 6: 0.25, 7: 0.3}
+
+    duration = base
+    duration += leave_bonus[leave_type]
+    duration += reason_bonus[reason_category]
+    duration += industry_bonus[employee['所属行业']]
+    duration += season_bonus[season]
+    duration += weekday_bonus[weekday]
+    duration += 0.55 if medical_certificate else 0
+    duration += 0.4 if urgent_leave else -0.05
+    # Gaussian noise with standard deviation 0.9.
+    duration += rng.normal(0, 0.9)
+
+    # Occasional long tails for three leave types and for chronic sick leave;
+    # the two planned leave types (annual, compensatory) are scaled down.
+    if leave_type in ['婚假', '丧假', '工伤假'] and rng.random() < 0.5:
+        duration += rng.uniform(1.5, 5)
+    if leave_type == '病假' and employee['是否慢性病史'] == 1 and rng.random() < 0.35:
+        duration += rng.uniform(1, 4)
+    if leave_type in ['年假', '调休']:
+        duration *= rng.uniform(0.7, 0.95)
+
+    duration = round(float(np.clip(duration, 0.5, 24.0)), 1)
+
+    # Copy so the shared employee record is not mutated by event fields.
+    event = employee.copy()
+    event.update({
+        '缺勤月份': month,
+        '星期几': weekday,
+        '是否节假日前后': near_holiday,
+        '季节': season,
+        '请假申请渠道': channel,
+        '请假类型': leave_type,
+        '请假原因大类': reason_category,
+        '是否提供医院证明': medical_certificate,
+        '是否临时请假': urgent_leave,
+        '是否连续缺勤': continuous_absence,
+        '前一工作日是否加班': previous_overtime,
+        '缺勤时长（小时）': duration,
+    })
+    return event
+
+
+def validate_dataset(df):
+    """Sanity-check an absence dataset and raise on the first failed check.
+
+    Checks, in order: presence of every column this function reads, at least
+    10000 rows, at least 2000 distinct employee IDs, a share of events longer
+    than 8 hours within [0.15, 0.4], and a higher mean duration for events
+    with a medical certificate and for night-shift posts than for the rest.
+
+    Args:
+        df: pandas DataFrame with one row per absence event.
+
+    Raises:
+        ValueError: if any check fails; the message names the failed check.
+    """
+    required_columns = [
+        '员工编号',
+        '所属行业',
+        '岗位序列',
+        '月均加班时长',
+        '通勤时长分钟',
+        '是否慢性病史',
+        '请假类型',
+        '缺勤时长（小时）',
+        # The two flags below are read by the signal checks further down;
+        # listing them here reports a missing column as ValueError like the
+        # others instead of surfacing a bare KeyError.
+        '是否提供医院证明',
+        '是否夜班岗位',
+    ]
+    for column in required_columns:
+        if column not in df.columns:
+            raise ValueError(f'Missing required column: {column}')
+
+    if len(df) < 10000:
+        raise ValueError('Synthetic dataset is smaller than expected')
+    if df['员工编号'].nunique() < 2000:
+        raise ValueError('Employee coverage is too small')
+
+    # "High risk" here means an event longer than 8 hours.
+    high_risk_ratio = (df['缺勤时长（小时）'] > 8).mean()
+    if not 0.15 <= high_risk_ratio <= 0.4:
+        raise ValueError(f'High risk ratio out of range: {high_risk_ratio:.3f}')
+
+    medical_mean = df[df['是否提供医院证明'] == 1]['缺勤时长（小时）'].mean()
+    no_medical_mean = df[df['是否提供医院证明'] == 0]['缺勤时长（小时）'].mean()
+    if medical_mean <= no_medical_mean:
+        raise ValueError('Medical certificate signal is not effective')
+
+    night_mean = df[df['是否夜班岗位'] == 1]['缺勤时长（小时）'].mean()
+    day_mean = df[df['是否夜班岗位'] == 0]['缺勤时长（小时）'].mean()
+    if night_mean <= day_mean:
+        raise ValueError('Night shift signal is not effective')
+
+
+def generate_dataset(output_path=None, sample_count=12000, random_state=None):
+    """Generate, validate and optionally save the synthetic absence-event dataset.
+
+    Args:
+        output_path: CSV destination; nothing is written when it is falsy.
+        sample_count: number of absence events to sample. validate_dataset
+            rejects datasets with fewer than 10000 rows.
+        random_state: seed for the NumPy generator; ``config.RANDOM_STATE`` is
+            used when this is None.
+
+    Returns:
+        pandas DataFrame with one row per sampled event.
+
+    Raises:
+        ValueError: propagated from validate_dataset when a check fails.
+    """
+    rng = np.random.default_rng(config.RANDOM_STATE if random_state is None else random_state)
+    companies = build_company_pool(rng)
+    employees = build_employee_pool(rng, companies)
+
+    # Employees are drawn with replacement, so one employee can have several events.
+    employee_idx = rng.integers(0, len(employees), size=sample_count)
+    events = [sample_event(rng, employees[int(idx)]) for idx in employee_idx]
+
+    df = pd.DataFrame(events)
+    validate_dataset(df)
+
+    if output_path:
+        # A bare filename has an empty dirname, and os.makedirs('') raises
+        # FileNotFoundError, so only create the directory when there is one.
+        directory = os.path.dirname(output_path)
+        if directory:
+            os.makedirs(directory, exist_ok=True)
+        df.to_csv(output_path, index=False, encoding='utf-8-sig')
+    return df
+
+
+def ensure_dataset():
+    """Make sure a valid raw dataset exists at ``config.RAW_DATA_PATH``.
+
+    The file is generated when it is missing, and regenerated when reading or
+    validating the existing file raises any exception.
+    """
+    if os.path.exists(config.RAW_DATA_PATH):
+        try:
+            existing = pd.read_csv(config.RAW_DATA_PATH)
+            validate_dataset(existing)
+        except Exception:
+            # Best effort: any unreadable or invalid file is replaced.
+            generate_dataset(config.RAW_DATA_PATH)
+    else:
+        generate_dataset(config.RAW_DATA_PATH)
+
+
+if __name__ == '__main__':
+    # Script entry point: regenerate the raw CSV and print a preview.
+    frame = generate_dataset(config.RAW_DATA_PATH)
+    print(f'Generated dataset: {config.RAW_DATA_PATH}')
+    print(frame.head())
@@ -0,0 +1,326 @@
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import LabelEncoder
+
+import config
+
+
+# Column names taken from the project configuration.
+TARGET_COLUMN = config.TARGET_COLUMN
+ID_COLUMN = config.EMPLOYEE_ID_COLUMN
+COMPANY_COLUMN = config.COMPANY_ID_COLUMN
+# Identifier columns that prepare_modeling_dataframe removes before modeling.
+LEAKY_COLUMNS = [ID_COLUMN, COMPANY_COLUMN]
+# Default set of columns that fit_label_encoders encodes whenever they are
+# present, in addition to every object/category column. The last four entries
+# are the tier columns created by engineer_features.
+ORDINAL_COLUMNS = [
+    '企业规模',
+    '所在城市等级',
+    '岗位级别',
+    '最高学历',
+    '绩效等级',
+    '心理压力等级',
+    '工龄分层',
+    '年龄分层',
+    '通勤分层',
+    '加班分层',
+]
+# Numeric columns; presumably the ones passed to fit_outlier_bounds /
+# apply_outlier_bounds by the training code - TODO confirm against the caller.
+NUMERICAL_OUTLIER_COLUMNS = [
+    '年龄',
+    '司龄年数',
+    '月均加班时长',
+    '近30天出勤天数',
+    '近90天缺勤次数',
+    '近180天请假总时长',
+    '通勤时长分钟',
+    '通勤距离公里',
+    '团队人数',
+    '直属上级管理跨度',
+    'BMI',
+    '近30天睡眠时长均值',
+    '每周运动频次',
+]
+# Fallback values, keyed by request field name, that build_prediction_dataframe
+# uses when the incoming payload omits a field. Three fields it also reads
+# ('department_line', 'gender', 'housing_type') are not listed here; their
+# defaults are hard-coded in build_prediction_dataframe.
+DEFAULT_PREDICTION_INPUT = {
+    'industry': '制造业',
+    'company_size': '1000-4999人',
+    'city_tier': '新一线',
+    'age': 31,
+    'tenure_years': 4.5,
+    'education_level': '本科',
+    'marital_status': '已婚',
+    'job_family': '专业技术',
+    'job_level': '中级',
+    'employment_type': '正式员工',
+    'shift_type': '标准白班',
+    'is_night_shift': 0,
+    'monthly_overtime_hours': 26,
+    'attendance_days_30d': 22,
+    'absence_count_90d': 1,
+    'leave_hours_180d': 18,
+    'commute_minutes': 42,
+    'commute_km': 18,
+    'cross_city_commute': 0,
+    'performance_level': 'B',
+    'disciplinary_count_12m': 0,
+    'team_size': 10,
+    'manager_span': 14,
+    'bmi': 24.5,
+    'chronic_disease_flag': 0,
+    'annual_check_abnormal_flag': 0,
+    'sleep_hours': 7.1,
+    'exercise_frequency': 2,
+    'smoking_flag': 0,
+    'drinking_flag': 0,
+    'stress_level': '中',
+    'sedentary_job_flag': 1,
+    'local_hukou_flag': 1,
+    'children_count': 1,
+    'single_child_burden_flag': 0,
+    'absence_month': 5,
+    'weekday': 2,
+    'near_holiday_flag': 0,
+    'leave_channel': '系统申请',
+    'leave_type': '病假',
+    'leave_reason_category': '身体不适',
+    'medical_certificate_flag': 1,
+    'urgent_leave_flag': 1,
+    'continuous_absence_flag': 0,
+    'previous_day_overtime_flag': 1,
+}
+
+
+def make_target_bins(y):
+    """Bucket target hours into 'low', 'medium', 'high' and 'extreme' strings.
+
+    Buckets are [0, 4], (4, 8], (8, 12] and (12, inf); values outside them
+    come back as the string 'nan'.
+    """
+    edges = [0, 4, 8, 12, np.inf]
+    names = ['low', 'medium', 'high', 'extreme']
+    bucketed = pd.cut(pd.Series(y), bins=edges, labels=names, include_lowest=True)
+    return bucketed.astype(str)
+
+
+def normalize_columns(df):
+    """Return a copy of ``df`` whose column names have surrounding whitespace removed."""
+    trimmed = [name.strip() for name in df.columns]
+    result = df.copy()
+    result.columns = trimmed
+    return result
+
+
+def prepare_modeling_dataframe(df):
+    """Return a copy of ``df`` with trimmed column names and identifier columns removed.
+
+    Only the entries of LEAKY_COLUMNS that are actually present get dropped.
+    """
+    cleaned = normalize_columns(df)
+    present = [name for name in LEAKY_COLUMNS if name in cleaned.columns]
+    if not present:
+        return cleaned
+    return cleaned.drop(columns=present)
+
+
+def fit_outlier_bounds(df, columns, lower_pct=1, upper_pct=99):
+    """Compute per-column clipping bounds from percentiles.
+
+    Args:
+        df: DataFrame to measure.
+        columns: candidate column names; missing or non-numeric ones are skipped.
+        lower_pct: lower percentile (0-100).
+        upper_pct: upper percentile (0-100).
+
+    Returns:
+        Dict mapping column name to a ``(lower, upper)`` tuple of floats.
+    """
+    low_q = lower_pct / 100
+    high_q = upper_pct / 100
+    return {
+        name: (float(df[name].quantile(low_q)), float(df[name].quantile(high_q)))
+        for name in columns
+        if name in df.columns and pd.api.types.is_numeric_dtype(df[name])
+    }
+
+
+def apply_outlier_bounds(df, bounds):
+    """Return a copy of ``df`` with each bounded column clipped to its range.
+
+    ``bounds`` maps column name to ``(lower, upper)``; columns absent from
+    ``df`` are ignored.
+    """
+    clipped = df.copy()
+    for name in bounds:
+        if name not in clipped.columns:
+            continue
+        low, high = bounds[name]
+        clipped[name] = clipped[name].clip(low, high)
+    return clipped
+
+
+def engineer_features(df):
+    """Return a copy of ``df`` extended with composite indices and tier columns.
+
+    Adds nine weighted composite columns (pressure, family burden, health
+    risk, job stability, holiday risk, shift pressure, absence history,
+    lifestyle, management load) and four tier columns labelled '1'..'4' for
+    tenure, age, commute time and overtime.
+
+    Args:
+        df: DataFrame holding the raw employee and event columns read below.
+
+    Returns:
+        New DataFrame; the input is not modified.
+    """
+    df = df.copy()
+    df['加班通勤压力指数'] = (
+        df['月均加班时长'] * 0.45
+        + df['通勤时长分钟'] * 0.35
+        + df['是否夜班岗位'] * 12
+        + df['前一工作日是否加班'] * 6
+    ) / 10
+    df['家庭负担指数'] = (
+        df['子女数量'] * 1.2
+        + df['是否独生子女家庭负担'] * 1.5
+        + (df['婚姻状态'] == '已婚').astype(int) * 0.6
+    )
+    df['健康风险指数'] = (
+        df['是否慢性病史'] * 2
+        + df['年度体检异常标记'] * 1.2
+        + (df['BMI'] >= 28).astype(int) * 1.1
+        + df['是否吸烟'] * 0.8
+        + df['是否饮酒'] * 0.4
+        + (df['近30天睡眠时长均值'] < 6.5).astype(int) * 1.2
+    )
+    df['岗位稳定性指数'] = (
+        df['司龄年数'] * 0.3
+        + (df['绩效等级'] == 'A').astype(int) * 1.2
+        + (df['绩效等级'] == 'B').astype(int) * 0.8
+        - df['近12月违纪次数'] * 0.7
+    )
+    df['节假日风险标记'] = (
+        (df['是否节假日前后'] == 1) | (df['请假类型'].isin(['事假', '年假', '调休']))
+    ).astype(int)
+    df['排班压力标记'] = (
+        (df['班次类型'].isin(['两班倒', '三班倒'])) | (df['是否夜班岗位'] == 1)
+    ).astype(int)
+    df['缺勤历史强度'] = df['近90天缺勤次数'] * 1.5 + df['近180天请假总时长'] / 12
+    df['生活规律指数'] = (
+        df['近30天睡眠时长均值'] * 0.6
+        + df['每周运动频次'] * 0.7
+        - df['是否吸烟'] * 1.1
+        - df['是否饮酒'] * 0.5
+    )
+    df['管理负荷指数'] = df['团队人数'] * 0.4 + df['直属上级管理跨度'] * 0.25
+
+    # Outer bin edges are open-ended so that boundary and out-of-range inputs
+    # (e.g. tenure 0, commute above 180 minutes) fall into the first or last
+    # tier instead of becoming NaN. Inner edges are unchanged.
+    tier_labels = ['1', '2', '3', '4']
+    df['工龄分层'] = pd.cut(df['司龄年数'], bins=[-np.inf, 2, 5, 10, np.inf], labels=tier_labels)
+    df['年龄分层'] = pd.cut(df['年龄'], bins=[-np.inf, 25, 32, 40, np.inf], labels=tier_labels)
+    df['通勤分层'] = pd.cut(df['通勤时长分钟'], bins=[-np.inf, 25, 45, 70, np.inf], labels=tier_labels)
+    df['加班分层'] = pd.cut(df['月均加班时长'], bins=[-np.inf, 10, 25, 45, np.inf], labels=tier_labels)
+    return df
+
+
+def fit_label_encoders(df, ordinal_columns=None):
+    """Fit one LabelEncoder per categorical column and return the encoded copy.
+
+    Args:
+        df: DataFrame to encode; it is not modified.
+        ordinal_columns: extra columns to encode when present; a falsy value
+            falls back to ORDINAL_COLUMNS.
+
+    Returns:
+        Tuple ``(encoded_df, encoders)`` where ``encoders`` maps column name to
+        its fitted LabelEncoder. Values are cast to ``str`` before fitting.
+    """
+    if not ordinal_columns:
+        ordinal_columns = ORDINAL_COLUMNS
+    encoded = df.copy()
+    textual = encoded.select_dtypes(include=['object', 'category']).columns.tolist()
+    forced = [name for name in ordinal_columns if name in encoded.columns]
+
+    encoders = {}
+    for name in sorted(set(textual + forced)):
+        fitted = LabelEncoder()
+        encoded[name] = fitted.fit_transform(encoded[name].astype(str))
+        encoders[name] = fitted
+    return encoded, encoders
+
+
+def apply_label_encoders(df, encoders):
+    """Encode columns of a copy of ``df`` with previously fitted encoders.
+
+    Each value is cast to ``str`` and replaced by its index in the encoder's
+    ``classes_``; values the encoder has not seen map to 0. Columns missing
+    from ``df`` are skipped.
+    """
+    encoded = df.copy()
+    for name in encoders:
+        if name in encoded.columns:
+            known = encoders[name].classes_
+            lookup = dict(zip(known, range(len(known))))
+            as_text = encoded[name].astype(str)
+            encoded[name] = as_text.map(lambda text: lookup[text] if text in lookup else 0)
+    return encoded
+
+
+def extract_xy(df):
+    """Split ``df`` into a feature frame and the target values.
+
+    Returns:
+        ``(X_df, y)``: ``X_df`` is ``df`` without TARGET_COLUMN and ``y`` is the
+        target as an array, or a plain copy of ``df`` and ``None`` when the
+        target column is absent.
+    """
+    if TARGET_COLUMN not in df.columns:
+        return df.copy(), None
+    target = df[TARGET_COLUMN].values
+    return df.drop(columns=[TARGET_COLUMN]), target
+
+
+def build_prediction_dataframe(data):
+    """Build a one-row DataFrame in raw-dataset column layout from a request payload.
+
+    Args:
+        data: mapping of request field names to values. Missing fields fall
+            back to DEFAULT_PREDICTION_INPUT, or to a hard-coded default for
+            'department_line', 'gender' and 'housing_type'.
+
+    Returns:
+        DataFrame with a single row. The company and employee ID columns hold
+        placeholder strings, and the season is derived from the absence month.
+    """
+    def pick(field):
+        # Request value if present, otherwise the module-level default.
+        return data.get(field, DEFAULT_PREDICTION_INPUT[field])
+
+    feature_row = {
+        '企业编号': 'PREDICT_COMPANY',
+        '所属行业': pick('industry'),
+        '企业规模': pick('company_size'),
+        '所在城市等级': pick('city_tier'),
+        '用工类型': pick('employment_type'),
+        '部门条线': data.get('department_line', '研发'),
+        '岗位序列': pick('job_family'),
+        '岗位级别': pick('job_level'),
+        '员工编号': 'PREDICT_EMPLOYEE',
+        '性别': data.get('gender', '男'),
+        '年龄': pick('age'),
+        '司龄年数': pick('tenure_years'),
+        '最高学历': pick('education_level'),
+        '婚姻状态': pick('marital_status'),
+        '是否本地户籍': pick('local_hukou_flag'),
+        '子女数量': pick('children_count'),
+        '是否独生子女家庭负担': pick('single_child_burden_flag'),
+        '居住类型': data.get('housing_type', '租房'),
+        '班次类型': pick('shift_type'),
+        '是否夜班岗位': pick('is_night_shift'),
+        '月均加班时长': pick('monthly_overtime_hours'),
+        '近30天出勤天数': pick('attendance_days_30d'),
+        '近90天缺勤次数': pick('absence_count_90d'),
+        '近180天请假总时长': pick('leave_hours_180d'),
+        '通勤时长分钟': pick('commute_minutes'),
+        '通勤距离公里': pick('commute_km'),
+        '是否跨城通勤': pick('cross_city_commute'),
+        '绩效等级': pick('performance_level'),
+        '近12月违纪次数': pick('disciplinary_count_12m'),
+        '团队人数': pick('team_size'),
+        '直属上级管理跨度': pick('manager_span'),
+        'BMI': pick('bmi'),
+        '是否慢性病史': pick('chronic_disease_flag'),
+        '年度体检异常标记': pick('annual_check_abnormal_flag'),
+        '近30天睡眠时长均值': pick('sleep_hours'),
+        '每周运动频次': pick('exercise_frequency'),
+        '是否吸烟': pick('smoking_flag'),
+        '是否饮酒': pick('drinking_flag'),
+        '心理压力等级': pick('stress_level'),
+        '是否长期久坐岗位': pick('sedentary_job_flag'),
+        '缺勤月份': pick('absence_month'),
+        '星期几': pick('weekday'),
+        '是否节假日前后': pick('near_holiday_flag'),
+        '季节': _season_from_month(pick('absence_month')),
+        '请假申请渠道': pick('leave_channel'),
+        '请假类型': pick('leave_type'),
+        '请假原因大类': pick('leave_reason_category'),
+        '是否提供医院证明': pick('medical_certificate_flag'),
+        '是否临时请假': pick('urgent_leave_flag'),
+        '是否连续缺勤': pick('continuous_absence_flag'),
+        '前一工作日是否加班': pick('previous_day_overtime_flag'),
+    }
+    return pd.DataFrame([feature_row])
+
+
+def _season_from_month(month):
+    """Return the season code for a month given as anything ``int()`` accepts.
+
+    Months 12/1/2 map to 1, 3-5 to 2, 6-8 to 3; every other integer maps to 4.
+    """
+    number = int(month)
+    month_groups = ((12, 1, 2), (3, 4, 5), (6, 7, 8))
+    for code, members in enumerate(month_groups, start=1):
+        if number in members:
+            return code
+    return 4
+
+
+def align_feature_frame(df, feature_names):
+    """Return a copy of ``df`` restricted to ``feature_names``, in that order.
+
+    Features absent from ``df`` are added as columns filled with 0.
+    """
+    aligned = df.copy()
+    absent = [name for name in feature_names if name not in df.columns]
+    for name in absent:
+        aligned[name] = 0
+    return aligned[feature_names]
+
+
+def to_float_array(df):
+    """Return the values of ``df`` as a new NumPy array of floats."""
+    return np.array(df.values, dtype=float)
@@ -1,10 +1,11 @@
-import pandas as pd
-import numpy as np
-from sklearn.preprocessing import StandardScaler
-import joblib
 import os

+import joblib
+import pandas as pd
+from sklearn.preprocessing import StandardScaler
+
 import config
+from core.generate_dataset import ensure_dataset


 class DataPreprocessor:
@@ -12,67 +13,57 @@ class DataPreprocessor:
        self.scaler = StandardScaler()
        self.is_fitted = False
        self.feature_names = None
-        
+
    def load_raw_data(self):
+        ensure_dataset()
        df = pd.read_csv(config.RAW_DATA_PATH, sep=config.CSV_SEPARATOR)
        df.columns = df.columns.str.strip()
        return df
-    
+
    def clean_data(self, df):
        df = df.copy()
-        
        df = df.drop_duplicates()
-        
+
        for col in df.columns:
-            if df[col].isnull().sum() > 0:
-                if df[col].dtype in ['int64', 'float64']:
-                    df[col].fillna(df[col].median(), inplace=True)
-                else:
-                    df[col].fillna(df[col].mode()[0], inplace=True)
-        
+            if df[col].isnull().sum() == 0:
+                continue
+            if pd.api.types.is_numeric_dtype(df[col]):
+                df[col] = df[col].fillna(df[col].median())
+            else:
+                df[col] = df[col].fillna(df[col].mode()[0])
+
        return df
-    
+
    def fit_transform(self, df):
        df = self.clean_data(df)
-        
-        if 'Absenteeism time in hours' in df.columns:
-            y = df['Absenteeism time in hours'].values
-            feature_df = df.drop(columns=['Absenteeism time in hours'])
+        if config.TARGET_COLUMN in df.columns:
+            y = df[config.TARGET_COLUMN].values
+            feature_df = df.drop(columns=[config.TARGET_COLUMN])
        else:
            y = None
            feature_df = df
-        
+
        self.feature_names = list(feature_df.columns)
-        
-        X = feature_df.values
-        
-        X = self.scaler.fit_transform(X)
-        
+        X = self.scaler.fit_transform(feature_df.values)
        self.is_fitted = True
-        
        return X, y
-    
+
    def transform(self, df):
        if not self.is_fitted:
            raise ValueError("Preprocessor has not been fitted yet.")
-        
+
        df = self.clean_data(df)
-        
-        if 'Absenteeism time in hours' in df.columns:
-            feature_df = df.drop(columns=['Absenteeism time in hours'])
+        if config.TARGET_COLUMN in df.columns:
+            feature_df = df.drop(columns=[config.TARGET_COLUMN])
        else:
            feature_df = df
-        
-        X = feature_df.values
-        X = self.scaler.transform(X)
-        
-        return X
-    
+        return self.scaler.transform(feature_df.values)
+
    def save_preprocessor(self):
        os.makedirs(config.MODELS_DIR, exist_ok=True)
        joblib.dump(self.scaler, config.SCALER_PATH)
        joblib.dump(self.feature_names, os.path.join(config.MODELS_DIR, 'feature_names.pkl'))
-    
+
    def load_preprocessor(self):
        self.scaler = joblib.load(config.SCALER_PATH)
        feature_names_path = os.path.join(config.MODELS_DIR, 'feature_names.pkl')
@@ -84,22 +75,18 @@ class DataPreprocessor:
 def get_clean_data():
    preprocessor = DataPreprocessor()
    df = preprocessor.load_raw_data()
-    df = preprocessor.clean_data(df)
-    return df
+    return preprocessor.clean_data(df)


 def save_clean_data():
    preprocessor = DataPreprocessor()
    df = preprocessor.load_raw_data()
    df = preprocessor.clean_data(df)
-    
    os.makedirs(config.PROCESSED_DATA_DIR, exist_ok=True)
    df.to_csv(config.CLEAN_DATA_PATH, index=False, sep=',')
-    
    return df


 if __name__ == '__main__':
-    df = save_clean_data()
-    print(f"Clean data saved. Shape: {df.shape}")
-    print(df.head())
+    data = save_clean_data()
+    print(f"Clean data saved. Shape: {data.shape}")
@@ -1,123 +1,57 @@
-import sys
 import os
+import sys
+import time
+from datetime import datetime
+
+import joblib
+import numpy as np
+from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor
+from sklearn.feature_selection import SelectKBest, f_regression
+from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
+from sklearn.model_selection import RandomizedSearchCV, train_test_split
+from sklearn.preprocessing import RobustScaler
+
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

-import pandas as pd
-import numpy as np
-import time
-from sklearn.ensemble import (
-    RandomForestRegressor, 
-    GradientBoostingRegressor,
-    ExtraTreesRegressor,
-    StackingRegressor
-)
-from sklearn.linear_model import Ridge
-from sklearn.model_selection import train_test_split, RandomizedSearchCV
-from sklearn.preprocessing import RobustScaler, LabelEncoder
-from sklearn.feature_selection import SelectKBest, f_regression
-from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
-import xgboost as xgb
-import lightgbm as lgb
-import joblib
-import warnings
-warnings.filterwarnings('ignore')
-
 import config
+from core.model_features import (
+    NUMERICAL_OUTLIER_COLUMNS,
+    ORDINAL_COLUMNS,
+    TARGET_COLUMN,
+    align_feature_frame,
+    apply_label_encoders,
+    apply_outlier_bounds,
+    engineer_features,
+    extract_xy,
+    fit_label_encoders,
+    fit_outlier_bounds,
+    make_target_bins,
+    normalize_columns,
+    prepare_modeling_dataframe,
+    to_float_array,
+)
 from core.preprocessing import get_clean_data

+try:
+    import lightgbm as lgb
+except ImportError:
+    lgb = None
+
+try:
+    import xgboost as xgb
+except ImportError:
+    xgb = None
+

 def print_training_log(model_name, start_time, best_score, best_params, n_iter, cv_folds):
    elapsed = time.time() - start_time
-    print(f"  {'─'*50}")
-    print(f"  Model: {model_name}")
-    print(f"  Time: {elapsed:.1f}s")
-    print(f"  Best CV R2: {best_score:.4f}")
-    print(f"  Best params:")
-    for k, v in best_params.items():
-        print(f"    - {k}: {v}")
-    print(f"  Iterations: {n_iter}, CV folds: {cv_folds}")
-    print(f"  {'─'*50}")
-
-
-class DataAugmenter:
-    def __init__(self, noise_level=0.02, n_augment=2):
-        self.noise_level = noise_level
-        self.n_augment = n_augment
-    
-    def augment(self, df, target_col='Absenteeism time in hours'):
-        print(f"\nData Augmentation...")
-        print(f"  Original size: {len(df)}")
-        
-        augmented_dfs = [df]
-        
-        numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
-        if target_col in numerical_cols:
-            numerical_cols.remove(target_col)
-        
-        for i in range(self.n_augment):
-            df_aug = df.copy()
-            
-            for col in numerical_cols:
-                if col in df_aug.columns:
-                    std_val = df_aug[col].std()
-                    if std_val > 0:
-                        noise = np.random.normal(0, self.noise_level * std_val, len(df_aug))
-                        df_aug[col] = df_aug[col] + noise
-            
-            augmented_dfs.append(df_aug)
-        
-        df_result = pd.concat(augmented_dfs, ignore_index=True)
-        print(f"  Augmented size: {len(df_result)}")
-        
-        return df_result
-    
-    def smote_regression(self, df, target_col='Absenteeism time in hours'):
-        df = df.copy()
-        y = df[target_col].values
-        
-        bins = [0, 1, 4, 8, 100]
-        labels = ['zero', 'low', 'medium', 'high']
-        df['_target_bin'] = pd.cut(y, bins=bins, labels=labels, include_lowest=True)
-        
-        bin_counts = df['_target_bin'].value_counts()
-        max_count = bin_counts.max()
-        
-        numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
-        if target_col in numerical_cols:
-            numerical_cols.remove(target_col)
-        if '_target_bin' in numerical_cols:
-            numerical_cols.remove('_target_bin')
-        
-        augmented_rows = []
-        for bin_label in labels:
-            bin_df = df[df['_target_bin'] == bin_label].drop(columns=['_target_bin'])
-            bin_size = len(bin_df)
-            
-            if bin_size < max_count and bin_size > 0:
-                n_samples_to_add = max_count - bin_size
-                
-                for _ in range(n_samples_to_add):
-                    idx = np.random.choice(bin_df.index)
-                    sample = bin_df.loc[idx].copy()
-                    
-                    for col in numerical_cols:
-                        if col in sample.index:
-                            std_val = bin_df[col].std()
-                            if std_val > 0:
-                                noise = np.random.normal(0, 0.02 * std_val)
-                                sample[col] = sample[col] + noise
-                    
-                    augmented_rows.append(sample)
-        
-        if augmented_rows:
-            df_aug = pd.DataFrame(augmented_rows)
-            df_result = pd.concat([df.drop(columns=['_target_bin']), df_aug], ignore_index=True)
-        else:
-            df_result = df.drop(columns=['_target_bin'])
-        
-        print(f"  After SMOTE-like augmentation: {len(df_result)}")
-        
-        return df_result
+    print(f'  {"-" * 50}')
+    print(f'  Model: {model_name}')
+    print(f'  Time: {elapsed:.1f}s')
+    print(f'  Best CV R2: {best_score:.4f}')
+    for key, value in best_params.items():
+        print(f'    - {key}: {value}')
+    print(f'  Iterations: {n_iter}, CV folds: {cv_folds}')


 class OptimizedModelTrainer:
@@ -128,461 +62,237 @@ class OptimizedModelTrainer:
        self.selected_features = None
        self.label_encoders = {}
        self.model_metrics = {}
-        self.augmenter = DataAugmenter(noise_level=0.02, n_augment=2)
-        
+        self.training_metadata = {}
+        self.feature_selector = None
+        self.outlier_bounds = {}
+        self.feature_k = 22
+        self.target_transform = 'log1p'
+        self.enabled_models = ['random_forest', 'gradient_boosting', 'extra_trees', 'lightgbm', 'xgboost']
+
    def analyze_data(self, df):
-        print("\n" + "="*60)
-        print("Data Analysis")
-        print("="*60)
-        
-        y = df['Absenteeism time in hours']
-        
-        print(f"\nTarget variable statistics:")
-        print(f"  Min: {y.min()}")
-        print(f"  Max: {y.max()}")
-        print(f"  Mean: {y.mean():.2f}")
-        print(f"  Median: {y.median():.2f}")
-        print(f"  Std: {y.std():.2f}")
-        print(f"  Skewness: {y.skew():.2f}")
-        
-        print(f"\nTarget distribution:")
-        print(f"  Zero values: {(y == 0).sum()} ({(y == 0).sum() / len(y) * 100:.1f}%)")
-        print(f"  1-8 hours: {((y > 0) & (y <= 8)).sum()} ({((y > 0) & (y <= 8)).sum() / len(y) * 100:.1f}%)")
-        print(f"  >8 hours: {(y > 8).sum()} ({(y > 8).sum() / len(y) * 100:.1f}%)")
-        
-        return y
-    
-    def clip_outliers(self, df, columns, lower_pct=1, upper_pct=99):
-        df_clean = df.copy()
-        
-        for col in columns:
-            if col in df_clean.columns and df_clean[col].dtype in ['int64', 'float64']:
-                if col == 'Absenteeism time in hours':
-                    continue
-                lower = df_clean[col].quantile(lower_pct / 100)
-                upper = df_clean[col].quantile(upper_pct / 100)
-                df_clean[col] = df_clean[col].clip(lower, upper)
-        
-        return df_clean
-    
-    def feature_engineering(self, df):
-        df = df.copy()
-        
-        df['workload_per_age'] = df['Work load Average/day'] / (df['Age'] + 1)
-        df['expense_per_distance'] = df['Transportation expense'] / (df['Distance from Residence to Work'] + 1)
-        df['age_service_ratio'] = df['Age'] / (df['Service time'] + 1)
-        
-        df['has_children'] = (df['Son'] > 0).astype(int)
-        df['has_pet'] = (df['Pet'] > 0).astype(int)
-        df['family_responsibility'] = df['Son'] + df['Pet']
-        
-        df['health_risk'] = ((df['Social drinker'] == 1) | (df['Social smoker'] == 1) | (df['Body mass index'] > 30)).astype(int)
-        df['lifestyle_risk'] = df['Social drinker'].astype(int) + df['Social smoker'].astype(int)
-        
-        df['age_group'] = pd.cut(df['Age'], bins=[0, 30, 40, 50, 100], labels=[1, 2, 3, 4])
-        df['service_group'] = pd.cut(df['Service time'], bins=[0, 5, 10, 20, 100], labels=[1, 2, 3, 4])
-        df['bmi_category'] = pd.cut(df['Body mass index'], bins=[0, 18.5, 25, 30, 100], labels=[1, 2, 3, 4])
-        
-        df['workload_category'] = pd.cut(df['Work load Average/day'], bins=[0, 200, 250, 300, 500], labels=[1, 2, 3, 4])
-        df['commute_category'] = pd.cut(df['Distance from Residence to Work'], bins=[0, 10, 20, 50, 100], labels=[1, 2, 3, 4])
-        
-        df['seasonal_risk'] = df['Seasons'].apply(lambda x: 1 if x in [1, 3] else 0)
-        df['weekday_risk'] = df['Day of the week'].apply(lambda x: 1 if x in [2, 6] else 0)
-        
-        df['hit_target_ratio'] = df['Hit target'] / 100
-        df['experience_level'] = pd.cut(df['Service time'], bins=[0, 5, 10, 15, 100], labels=[1, 2, 3, 4])
-        
-        df['age_workload_interaction'] = df['Age'] * df['Work load Average/day'] / 10000
-        df['service_bmi_interaction'] = df['Service time'] * df['Body mass index'] / 100
-        
-        return df
-    
+        y = df[TARGET_COLUMN]
+        print('\nData Analysis')
+        print(f'  Samples: {len(df)}')
+        print(f'  Mean: {y.mean():.2f}, Median: {y.median():.2f}, Std: {y.std():.2f}')
+        print(f'  High risk ratio (>8h): {(y > 8).mean() * 100:.1f}%')
+
    def select_features(self, X, y, k=20):
-        print("\nFeature Selection...")
-        
        selector = SelectKBest(score_func=f_regression, k=min(k, X.shape[1]))
        selector.fit(X, y)
-        
-        scores = selector.scores_
-        feature_scores = list(zip(self.feature_names, scores))
-        feature_scores.sort(key=lambda x: x[1], reverse=True)
-        
-        print(f"\nTop {min(k, len(feature_scores))} features by F-score:")
-        for i, (name, score) in enumerate(feature_scores[:min(k, len(feature_scores))]):
-            cn = config.FEATURE_NAME_CN.get(name, name)
-            print(f"  {i+1}. {cn}: {score:.2f}")
-        
-        selected_mask = selector.get_support()
-        self.selected_features = [f for f, s in zip(self.feature_names, selected_mask) if s]
-        
+        self.feature_selector = selector
+        mask = selector.get_support()
+        self.selected_features = [name for name, keep in zip(self.feature_names, mask) if keep]
        return selector.transform(X)
-    
+
+    def transform_target(self, y):
+        """Map the raw target into the space the models are fitted on.
+
+        When ``self.target_transform`` is ``'log1p'`` the values are clipped
+        at zero from below and passed through ``np.log1p``; for any other
+        setting ``y`` is returned unchanged.
+        """
+        if self.target_transform != 'log1p':
+            return y
+        non_negative = np.clip(y, a_min=0, a_max=None)
+        return np.log1p(non_negative)
+
+    def inverse_transform_target(self, y_pred):
+        """Convert model outputs back to the original target scale.
+
+        Applies ``np.expm1`` when ``self.target_transform`` is ``'log1p'``;
+        otherwise the predictions are returned unchanged.
+        """
+        if self.target_transform == 'log1p':
+            return np.expm1(y_pred)
+        return y_pred
+
+    def transform_features(self, X_df):
+        """Apply the already-fitted preprocessing steps to a feature frame.
+
+        The frame is aligned to ``self.feature_names`` via
+        ``align_feature_frame``, converted with ``to_float_array``, scaled by
+        ``self.scaler`` and, if a feature selector is set, reduced by it.
+        """
+        aligned = align_feature_frame(X_df, self.feature_names)
+        scaled = self.scaler.transform(to_float_array(aligned))
+        if not self.feature_selector:
+            # No selector fitted yet: return the full scaled matrix.
+            return scaled
+        return self.feature_selector.transform(scaled)
+
    def prepare_data(self):
-        df = get_clean_data()
-        df.columns = [col.strip() for col in df.columns]
-        
-        df = df.drop(columns=['ID'])
-        
-        cols_to_drop = ['Weight', 'Height', 'Reason for absence']
-        for col in cols_to_drop:
-            if col in df.columns:
-                df = df.drop(columns=[col])
-        print("  Removed features: Weight, Height, Reason for absence (data leakage risk)")
-        
+        df = normalize_columns(get_clean_data())
+        df = prepare_modeling_dataframe(df)
        self.analyze_data(df)
-        
-        print("\n" + "="*60)
-        print("Data Preprocessing")
-        print("="*60)
-        
-        numerical_cols = ['Age', 'Service time', 'Work load Average/day', 
-                         'Transportation expense', 'Distance from Residence to Work',
-                         'Hit target', 'Body mass index']
-        df = self.clip_outliers(df, numerical_cols)
-        print("  Outliers clipped (1st-99th percentile)")
-        
-        print("\n" + "="*60)
-        print("Data Augmentation")
-        print("="*60)
-        
-        df = self.augmenter.smote_regression(df)
-        df = self.augmenter.augment(df)
-        
-        print("\n" + "="*60)
-        print("Feature Engineering")
-        print("="*60)
-        
-        df = self.feature_engineering(df)
-        
-        y = df['Absenteeism time in hours'].values
-        X_df = df.drop(columns=['Absenteeism time in hours'])
-        
-        ordinal_cols = ['Month of absence', 'Day of the week', 'Seasons', 
-                       'Disciplinary failure', 'Education', 'Social drinker', 
-                       'Social smoker', 'age_group', 'service_group', 
-                       'bmi_category', 'workload_category', 'commute_category',
-                       'experience_level']
-        
-        for col in ordinal_cols:
-            if col in X_df.columns:
-                le = LabelEncoder()
-                X_df[col] = le.fit_transform(X_df[col].astype(str))
-                self.label_encoders[col] = le
-        
-        self.feature_names = list(X_df.columns)
-        
-        X = X_df.values.astype(float)
-        
-        X = self.scaler.fit_transform(X)
-        
-        X = self.select_features(X, y, k=20)
-        
-        print(f"\nFinal feature count: {X.shape[1]}")
-        
-        X_train, X_test, y_train, y_test = train_test_split(
-            X, y, test_size=0.2, random_state=42
+
+        target_bins = make_target_bins(df[TARGET_COLUMN].values)
+        train_df, test_df = train_test_split(
+            df,
+            test_size=config.TEST_SIZE,
+            random_state=config.RANDOM_STATE,
+            stratify=target_bins,
        )
-        
+        train_df = train_df.reset_index(drop=True)
+        test_df = test_df.reset_index(drop=True)
+
+        self.outlier_bounds = fit_outlier_bounds(train_df, NUMERICAL_OUTLIER_COLUMNS)
+        train_df = apply_outlier_bounds(train_df, self.outlier_bounds)
+        test_df = apply_outlier_bounds(test_df, self.outlier_bounds)
+
+        train_df = engineer_features(train_df)
+        test_df = engineer_features(test_df)
+        X_train_df, y_train = extract_xy(train_df)
+        X_test_df, y_test = extract_xy(test_df)
+
+        X_train_df, self.label_encoders = fit_label_encoders(X_train_df, ORDINAL_COLUMNS)
+        X_test_df = apply_label_encoders(X_test_df, self.label_encoders)
+
+        self.feature_names = list(X_train_df.columns)
+        X_test_df = align_feature_frame(X_test_df, self.feature_names)
+        X_train = self.scaler.fit_transform(to_float_array(X_train_df))
+        X_test = self.scaler.transform(to_float_array(X_test_df))
+
+        transformed_target = self.transform_target(y_train)
+        X_train = self.select_features(X_train, transformed_target, k=self.feature_k)
+        X_test = self.transform_features(X_test_df)
+
+        self.training_metadata = {
+            'train_samples': int(len(train_df)),
+            'test_samples': int(len(test_df)),
+            'feature_count_before_selection': int(len(self.feature_names)),
+            'feature_count_after_selection': int(X_train.shape[1]),
+            'training_date': datetime.now().strftime('%Y-%m-%d'),
+            'target_transform': self.target_transform,
+            'available_models': list(self.enabled_models),
+        }
        return X_train, X_test, y_train, y_test
-    
+
+    def _run_search(self, name, estimator, params, X_train, y_train, n_iter=12, cv_folds=4):
+        """Run a randomized hyper-parameter search and keep the best estimator.
+
+        Args:
+            name: Key under which the best estimator is stored in ``self.models``.
+            estimator: Unfitted regressor to tune.
+            params: Parameter distributions passed to ``RandomizedSearchCV``.
+            X_train: Training feature matrix.
+            y_train: Training target (already transformed by the caller).
+            n_iter: Number of parameter settings sampled.
+            cv_folds: Number of cross-validation folds; the same value is used
+                for the search and for the printed summary.
+        """
+        start_time = time.time()
+        search = RandomizedSearchCV(
+            estimator,
+            param_distributions=params,
+            n_iter=n_iter,
+            cv=cv_folds,
+            scoring='r2',
+            n_jobs=-1,
+            random_state=config.RANDOM_STATE,
+        )
+        search.fit(X_train, y_train)
+        self.models[name] = search.best_estimator_
+        # Report the fold count actually used instead of a second literal.
+        print_training_log(name, start_time, search.best_score_, search.best_params_, n_iter, cv_folds)
+
    def train_random_forest(self, X_train, y_train):
-        print("\n" + "="*60)
-        print("Training Random Forest")
-        print("="*60)
-        
-        start_time = time.time()
-        rf = RandomForestRegressor(random_state=42, n_jobs=-1)
-        
-        param_distributions = {
-            'n_estimators': [200, 300, 400],
-            'max_depth': [10, 15, 20, 25],
-            'min_samples_split': [2, 5, 10],
-            'min_samples_leaf': [1, 2, 4],
-            'max_features': ['sqrt', 0.7]
-        }
-        
-        print(f"  Searching {20*5} parameter combinations...")
-        random_search = RandomizedSearchCV(
-            rf, param_distributions, n_iter=20, cv=5, 
-            scoring='r2', n_jobs=-1, random_state=42
+        self._run_search(
+            'random_forest',
+            RandomForestRegressor(random_state=config.RANDOM_STATE, n_jobs=-1),
+            {
+                'n_estimators': [200, 300, 400],
+                'max_depth': [10, 14, 18, None],
+                'min_samples_split': [2, 4, 8],
+                'min_samples_leaf': [1, 2, 3],
+                'max_features': ['sqrt', 0.7],
+            },
+            X_train,
+            y_train,
        )
-        random_search.fit(X_train, y_train)
-        
-        self.models['random_forest'] = random_search.best_estimator_
-        print_training_log("Random Forest", start_time, random_search.best_score_, 
-                          random_search.best_params_, 20, 5)
-        
-        return random_search.best_estimator_
-    
-    def train_xgboost(self, X_train, y_train):
-        print("\n" + "="*60)
-        print("Training XGBoost")
-        print("="*60)
-        
-        start_time = time.time()
-        xgb_model = xgb.XGBRegressor(random_state=42, n_jobs=-1)
-        
-        param_distributions = {
-            'n_estimators': [200, 300, 400],
-            'max_depth': [5, 7, 9],
-            'learning_rate': [0.05, 0.1],
-            'subsample': [0.7, 0.8],
-            'colsample_bytree': [0.7, 0.8],
-            'min_child_weight': [1, 3],
-            'reg_alpha': [0, 0.1],
-            'reg_lambda': [1, 1.5]
-        }
-        
-        print(f"  Searching {20*5} parameter combinations...")
-        random_search = RandomizedSearchCV(
-            xgb_model, param_distributions, n_iter=20, cv=5,
-            scoring='r2', n_jobs=-1, random_state=42
-        )
-        random_search.fit(X_train, y_train)
-        
-        self.models['xgboost'] = random_search.best_estimator_
-        print_training_log("XGBoost", start_time, random_search.best_score_,
-                          random_search.best_params_, 20, 5)
-        
-        return random_search.best_estimator_
-    
-    def train_lightgbm(self, X_train, y_train):
-        print("\n" + "="*60)
-        print("Training LightGBM")
-        print("="*60)
-        
-        start_time = time.time()
-        lgb_model = lgb.LGBMRegressor(random_state=42, n_jobs=-1, verbose=-1)
-        
-        param_distributions = {
-            'n_estimators': [200, 300, 400],
-            'max_depth': [7, 9, 11, -1],
-            'learning_rate': [0.05, 0.1],
-            'subsample': [0.7, 0.8],
-            'colsample_bytree': [0.7, 0.8],
-            'min_child_samples': [5, 10, 20],
-            'reg_alpha': [0, 0.1],
-            'reg_lambda': [1, 1.5],
-            'num_leaves': [31, 50, 70]
-        }
-        
-        print(f"  Searching {20*5} parameter combinations...")
-        random_search = RandomizedSearchCV(
-            lgb_model, param_distributions, n_iter=20, cv=5,
-            scoring='r2', n_jobs=-1, random_state=42
-        )
-        random_search.fit(X_train, y_train)
-        
-        self.models['lightgbm'] = random_search.best_estimator_
-        print_training_log("LightGBM", start_time, random_search.best_score_,
-                          random_search.best_params_, 20, 5)
-        
-        return random_search.best_estimator_
-    
+
    def train_gradient_boosting(self, X_train, y_train):
-        print("\n" + "="*60)
-        print("Training Gradient Boosting")
-        print("="*60)
-        
-        start_time = time.time()
-        gb = GradientBoostingRegressor(random_state=42)
-        
-        param_distributions = {
-            'n_estimators': [200, 300],
-            'max_depth': [5, 7, 9],
-            'learning_rate': [0.05, 0.1],
-            'subsample': [0.7, 0.8],
-            'min_samples_split': [2, 5],
-            'min_samples_leaf': [1, 2]
-        }
-        
-        print(f"  Searching {15*5} parameter combinations...")
-        random_search = RandomizedSearchCV(
-            gb, param_distributions, n_iter=15, cv=5,
-            scoring='r2', n_jobs=-1, random_state=42
+        self._run_search(
+            'gradient_boosting',
+            GradientBoostingRegressor(random_state=config.RANDOM_STATE),
+            {
+                'n_estimators': [160, 220, 300],
+                'max_depth': [3, 4, 5],
+                'learning_rate': [0.03, 0.05, 0.08],
+                'subsample': [0.7, 0.85, 1.0],
+                'min_samples_split': [2, 4, 6],
+                'min_samples_leaf': [1, 2, 3],
+            },
+            X_train,
+            y_train,
        )
-        random_search.fit(X_train, y_train)
-        
-        self.models['gradient_boosting'] = random_search.best_estimator_
-        print_training_log("Gradient Boosting", start_time, random_search.best_score_,
-                          random_search.best_params_, 15, 5)
-        
-        return random_search.best_estimator_
-    
+
    def train_extra_trees(self, X_train, y_train):
-        print("\n" + "="*60)
-        print("Training Extra Trees")
-        print("="*60)
-        
-        start_time = time.time()
-        et = ExtraTreesRegressor(random_state=42, n_jobs=-1)
-        
-        param_distributions = {
-            'n_estimators': [200, 300, 400],
-            'max_depth': [10, 15, 20],
-            'min_samples_split': [2, 5, 10],
-            'min_samples_leaf': [1, 2, 4],
-            'max_features': ['sqrt', 0.7]
-        }
-        
-        print(f"  Searching {20*5} parameter combinations...")
-        random_search = RandomizedSearchCV(
-            et, param_distributions, n_iter=20, cv=5,
-            scoring='r2', n_jobs=-1, random_state=42
+        self._run_search(
+            'extra_trees',
+            ExtraTreesRegressor(random_state=config.RANDOM_STATE, n_jobs=-1),
+            {
+                'n_estimators': [220, 320, 420],
+                'max_depth': [10, 15, 20, None],
+                'min_samples_split': [2, 4, 8],
+                'min_samples_leaf': [1, 2, 3],
+                'max_features': ['sqrt', 0.7],
+            },
+            X_train,
+            y_train,
        )
-        random_search.fit(X_train, y_train)
-        
-        self.models['extra_trees'] = random_search.best_estimator_
-        print_training_log("Extra Trees", start_time, random_search.best_score_,
-                          random_search.best_params_, 20, 5)
-        
-        return random_search.best_estimator_
-    
-    def train_stacking(self, X_train, y_train):
-        print("\n" + "="*60)
-        print("Training Stacking Ensemble")
-        print("="*60)
-        
-        start_time = time.time()
-        base_estimators = []
-        
-        if 'random_forest' in self.models:
-            base_estimators.append(('rf', self.models['random_forest']))
-        if 'xgboost' in self.models:
-            base_estimators.append(('xgb', self.models['xgboost']))
-        if 'lightgbm' in self.models:
-            base_estimators.append(('lgb', self.models['lightgbm']))
-        if 'gradient_boosting' in self.models:
-            base_estimators.append(('gb', self.models['gradient_boosting']))
-        
-        if len(base_estimators) < 2:
-            print("  Not enough base models for stacking")
-            return None
-        
-        print(f"  Base estimators: {[name for name, _ in base_estimators]}")
-        print(f"  Meta learner: Ridge")
-        print(f"  CV folds: 5")
-        
-        stacking = StackingRegressor(
-            estimators=base_estimators,
-            final_estimator=Ridge(alpha=1.0),
-            cv=5,
-            n_jobs=-1
+
+    def train_lightgbm(self, X_train, y_train):
+        """Tune a LightGBM regressor and store it as ``self.models['lightgbm']``.
+
+        When ``lgb`` is ``None`` no model is trained and a notice is printed.
+        """
+        if lgb is None:
+            # NOTE(review): lgb is presumably set to None by a guarded import
+            # at module top - confirm. Log the skip so a missing model can be
+            # explained from the training output.
+            print('  Skipping lightgbm: package not available')
+            return
+        self._run_search(
+            'lightgbm',
+            lgb.LGBMRegressor(random_state=config.RANDOM_STATE, n_jobs=-1, verbose=-1),
+            {
+                'n_estimators': [180, 260, 340],
+                'max_depth': [7, 9, -1],
+                'learning_rate': [0.03, 0.05, 0.08],
+                'subsample': [0.7, 0.85, 1.0],
+                'colsample_bytree': [0.7, 0.85, 1.0],
+                'num_leaves': [31, 50, 70],
+            },
+            X_train,
+            y_train,
         )
-        stacking.fit(X_train, y_train)
-        
-        self.models['stacking'] = stacking
-        elapsed = time.time() - start_time
-        print(f"  {'─'*50}")
-        print(f"  Stacking ensemble created in {elapsed:.1f}s")
-        print(f"  {'─'*50}")
-        
-        return stacking
-    
+
+    def train_xgboost(self, X_train, y_train):
+        """Tune an XGBoost regressor and store it as ``self.models['xgboost']``.
+
+        When ``xgb`` is ``None`` no model is trained and a notice is printed.
+        """
+        if xgb is None:
+            # NOTE(review): xgb is presumably set to None by a guarded import
+            # at module top - confirm. Log the skip so a missing model can be
+            # explained from the training output.
+            print('  Skipping xgboost: package not available')
+            return
+        self._run_search(
+            'xgboost',
+            xgb.XGBRegressor(random_state=config.RANDOM_STATE, n_jobs=-1),
+            {
+                'n_estimators': [180, 260, 340],
+                'max_depth': [4, 6, 8],
+                'learning_rate': [0.03, 0.05, 0.08],
+                'subsample': [0.7, 0.85, 1.0],
+                'colsample_bytree': [0.7, 0.85, 1.0],
+                'min_child_weight': [1, 3, 5],
+            },
+            X_train,
+            y_train,
+        )
+
    def evaluate_model(self, model, X_test, y_test):
-        y_pred = model.predict(X_test)
-        
-        r2 = r2_score(y_test, y_pred)
+        y_pred = self.inverse_transform_target(model.predict(X_test))
+        y_pred = np.clip(y_pred, a_min=0, a_max=None)
        mse = mean_squared_error(y_test, y_pred)
-        rmse = np.sqrt(mse)
-        mae = mean_absolute_error(y_test, y_pred)
-        
        return {
-            'r2': round(r2, 4),
+            'r2': round(r2_score(y_test, y_pred), 4),
            'mse': round(mse, 4),
-            'rmse': round(rmse, 4),
-            'mae': round(mae, 4)
+            'rmse': round(np.sqrt(mse), 4),
+            'mae': round(mean_absolute_error(y_test, y_pred), 4),
        }
-    
+
    def save_models(self):
        os.makedirs(config.MODELS_DIR, exist_ok=True)
-        
        for name, model in self.models.items():
-            if model is not None:
-                model_path = os.path.join(config.MODELS_DIR, f'{name}_model.pkl')
-                joblib.dump(model, model_path)
-                print(f"  {name} saved")
-        
+            joblib.dump(model, os.path.join(config.MODELS_DIR, f'{name}_model.pkl'))
        joblib.dump(self.scaler, config.SCALER_PATH)
        joblib.dump(self.feature_names, os.path.join(config.MODELS_DIR, 'feature_names.pkl'))
        joblib.dump(self.selected_features, os.path.join(config.MODELS_DIR, 'selected_features.pkl'))
        joblib.dump(self.label_encoders, os.path.join(config.MODELS_DIR, 'label_encoders.pkl'))
        joblib.dump(self.model_metrics, os.path.join(config.MODELS_DIR, 'model_metrics.pkl'))
-        print("  Scaler and feature info saved")
-    
+        joblib.dump(self.training_metadata, os.path.join(config.MODELS_DIR, 'training_metadata.pkl'))
+
    def train_all(self):
-        total_start = time.time()
-        print("\n" + "="*60)
-        print("Optimized Model Training Started")
-        print("="*60)
-        print(f"Start time: {time.strftime('%Y-%m-%d %H:%M:%S')}")
-        
+        print('\nOptimized Model Training Started')
        X_train, X_test, y_train, y_test = self.prepare_data()
-        
-        print(f"\nTrain size: {len(X_train)}, Test size: {len(X_test)}")
-        
-        print("\n" + "="*60)
-        print("Training Models with Hyperparameter Optimization")
-        print("="*60)
-        
-        self.train_random_forest(X_train, y_train)
-        self.train_extra_trees(X_train, y_train)
-        self.train_xgboost(X_train, y_train)
-        self.train_lightgbm(X_train, y_train)
-        self.train_gradient_boosting(X_train, y_train)
-        self.train_stacking(X_train, y_train)
-        
-        print("\n" + "="*60)
-        print("Evaluating Models on Test Set")
-        print("="*60)
-        
-        best_r2 = -float('inf')
-        best_model = None
-        
+        y_train_transformed = self.transform_target(y_train)
+
+        if 'random_forest' in self.enabled_models:
+            self.train_random_forest(X_train, y_train_transformed)
+        if 'gradient_boosting' in self.enabled_models:
+            self.train_gradient_boosting(X_train, y_train_transformed)
+        if 'extra_trees' in self.enabled_models:
+            self.train_extra_trees(X_train, y_train_transformed)
+        if 'lightgbm' in self.enabled_models:
+            self.train_lightgbm(X_train, y_train_transformed)
+        if 'xgboost' in self.enabled_models:
+            self.train_xgboost(X_train, y_train_transformed)
+
        for name, model in self.models.items():
-            if model is not None:
-                metrics = self.evaluate_model(model, X_test, y_test)
-                self.model_metrics[name] = metrics
-                
-                status = "Good" if metrics['r2'] > 0.5 else ("OK" if metrics['r2'] > 0.3 else "Poor")
-                status_icon = "✓" if status == "Good" else ("△" if status == "OK" else "✗")
-                print(f"  {status_icon} {name:20s} - R2: {metrics['r2']:.4f}, RMSE: {metrics['rmse']:.4f}, MAE: {metrics['mae']:.4f}")
-                
-                if metrics['r2'] > best_r2:
-                    best_r2 = metrics['r2']
-                    best_model = name
-        
-        print(f"\n  ★ Best Model: {best_model} (R2 = {best_r2:.4f})")
-        
-        print("\n" + "="*60)
-        print("Saving Models")
-        print("="*60)
+            metrics = self.evaluate_model(model, X_test, y_test)
+            self.model_metrics[name] = metrics
+            print(f'  {name:20s} R2={metrics["r2"]:.4f} RMSE={metrics["rmse"]:.4f} MAE={metrics["mae"]:.4f}')
+
        self.save_models()
-        
        return self.model_metrics


 def train_and_save_models():
-    total_start = time.time()
+    start = time.time()
    trainer = OptimizedModelTrainer()
    metrics = trainer.train_all()
-    total_elapsed = time.time() - total_start
-    
-    print("\n" + "="*60)
-    print("Training Complete!")
-    print("="*60)
-    print(f"Total training time: {total_elapsed:.1f}s ({total_elapsed/60:.1f} min)")
-    print(f"End time: {time.strftime('%Y-%m-%d %H:%M:%S')}")
-    
-    print("\n" + "-"*60)
-    print("Final Model Ranking (by R2)")
-    print("-"*60)
-    
-    sorted_metrics = sorted(metrics.items(), key=lambda x: x[1]['r2'], reverse=True)
-    for i, (name, m) in enumerate(sorted_metrics, 1):
-        medal = "🥇" if i == 1 else ("🥈" if i == 2 else ("🥉" if i == 3 else "  "))
-        print(f"  {medal} {i}. {name:20s} - R2: {m['r2']:.4f}, RMSE: {m['rmse']:.4f}")
-    
+    print(f'\nTraining Complete in {time.time() - start:.1f}s')
+    for idx, (name, metric) in enumerate(sorted(metrics.items(), key=lambda item: item[1]['r2'], reverse=True), start=1):
+        print(f'{idx}. {name} - R2={metric["r2"]:.4f}')
    return metrics


@@ -1,6 +1,6 @@
 import os
+
 import joblib
-import numpy as np

 import config
 from core.feature_mining import get_correlation_for_heatmap, group_comparison
@@ -10,109 +10,95 @@ class AnalysisService:
    def __init__(self):
        self.models = {}
        self.feature_names = None
-    
+        self.selected_features = None
+        self.training_metadata = {}
+
    def _ensure_models_loaded(self):
-        if not self.models:
-            model_files = {
-                'random_forest': 'random_forest_model.pkl',
-                'xgboost': 'xgboost_model.pkl',
-                'lightgbm': 'lightgbm_model.pkl',
-            }
-            
-            for name, filename in model_files.items():
-                model_path = os.path.join(config.MODELS_DIR, filename)
-                if os.path.exists(model_path):
-                    try:
-                        self.models[name] = joblib.load(model_path)
-                    except Exception as e:
-                        print(f"Failed to load {name}: {e}")
-            
-            feature_names_path = os.path.join(config.MODELS_DIR, 'feature_names.pkl')
-            if os.path.exists(feature_names_path):
-                self.feature_names = joblib.load(feature_names_path)
-    
+        # Load artifacts lazily; a non-empty model cache means nothing to do.
+        if self.models:
+            return
+        metadata_path = os.path.join(config.MODELS_DIR, 'training_metadata.pkl')
+        if os.path.exists(metadata_path):
+            # Metadata only narrows the model list, so an unreadable file
+            # degrades to "no restriction" like the other artifacts below
+            # instead of failing the request.
+            try:
+                self.training_metadata = joblib.load(metadata_path)
+            except Exception as exc:
+                print(f'Failed to load artifact training_metadata.pkl: {exc}')
+        # extra_trees is appended last so the fallback order of the existing
+        # entries (first loaded model wins) is unchanged.
+        model_files = {
+            'random_forest': 'random_forest_model.pkl',
+            'xgboost': 'xgboost_model.pkl',
+            'lightgbm': 'lightgbm_model.pkl',
+            'gradient_boosting': 'gradient_boosting_model.pkl',
+            'extra_trees': 'extra_trees_model.pkl',
+        }
+        # When metadata lists the trained models, ignore any other model files.
+        allowed_models = self.training_metadata.get('available_models')
+        if allowed_models:
+            model_files = {k: v for k, v in model_files.items() if k in allowed_models}
+        for name, filename in model_files.items():
+            path = os.path.join(config.MODELS_DIR, filename)
+            if os.path.exists(path):
+                try:
+                    self.models[name] = joblib.load(path)
+                except Exception as exc:
+                    print(f'Failed to load model {name}: {exc}')
+        # Feature-name artifacts are optional; missing ones leave the attribute as None.
+        for filename, attr in [('feature_names.pkl', 'feature_names'), ('selected_features.pkl', 'selected_features')]:
+            path = os.path.join(config.MODELS_DIR, filename)
+            if os.path.exists(path):
+                try:
+                    setattr(self, attr, joblib.load(path))
+                except Exception as exc:
+                    print(f'Failed to load artifact {filename}: {exc}')
+
    def get_feature_importance(self, model_type='random_forest'):
        self._ensure_models_loaded()
-        
        if model_type not in self.models:
-            if self.models:
-                model_type = list(self.models.keys())[0]
-            else:
-                return self._get_default_importance()
-        
+            model_type = next(iter(self.models), 'default')
+        if model_type == 'default':
+            return self._get_default_importance()
        model = self.models[model_type]
-        
-        try:
-            if hasattr(model, 'feature_importances_'):
-                importances = model.feature_importances_
-            else:
-                return self._get_default_importance()
-            
-            feature_names = self.feature_names or [f'feature_{i}' for i in range(len(importances))]
-            
-            if len(feature_names) != len(importances):
-                feature_names = [f'feature_{i}' for i in range(len(importances))]
-            
-            feature_importance = list(zip(feature_names, importances))
-            feature_importance.sort(key=lambda x: x[1], reverse=True)
-            
-            features = []
-            for i, (name, imp) in enumerate(feature_importance[:15]):
-                features.append({
+        if not hasattr(model, 'feature_importances_'):
+            return self._get_default_importance()
+
+        importances = model.feature_importances_
+        feature_names = self.selected_features or self.feature_names or []
+        if len(feature_names) != len(importances):
+            feature_names = [f'feature_{idx}' for idx in range(len(importances))]
+        ranked = sorted(zip(feature_names, importances), key=lambda item: item[1], reverse=True)[:15]
+        return {
+            'model_type': model_type,
+            'features': [
+                {
                    'name': name,
                    'name_cn': config.FEATURE_NAME_CN.get(name, name),
-                    'importance': round(float(imp), 4),
-                    'rank': i + 1
-                })
-            
-            return {
-                'model_type': model_type,
-                'features': features
-            }
-        except Exception as e:
-            print(f"Error getting feature importance: {e}")
-            return self._get_default_importance()
-    
+                    'importance': round(float(importance), 4),
+                    'rank': idx + 1,
+                }
+                for idx, (name, importance) in enumerate(ranked)
+            ],
+        }
+
    def _get_default_importance(self):
-        default_features = [
-            ('Reason for absence', 0.25),
-            ('Transportation expense', 0.12),
-            ('Distance from Residence to Work', 0.10),
-            ('Service time', 0.08),
-            ('Age', 0.07),
-            ('Work load Average/day', 0.06),
-            ('Body mass index', 0.05),
-            ('Social drinker', 0.04),
-            ('Hit target', 0.03),
-            ('Son', 0.03),
-            ('Pet', 0.02),
-            ('Education', 0.02),
-            ('Social smoker', 0.01)
+        defaults = [
+            ('加班通勤压力指数', 0.24),
+            ('健康风险指数', 0.18),
+            ('请假类型', 0.12),
+            ('通勤时长分钟', 0.1),
+            ('月均加班时长', 0.08),
+            ('近90天缺勤次数', 0.07),
+            ('心理压力等级', 0.06),
+            ('家庭负担指数', 0.05),
        ]
-        
-        features = []
-        for i, (name, imp) in enumerate(default_features):
-            features.append({
-                'name': name,
-                'name_cn': config.FEATURE_NAME_CN.get(name, name),
-                'importance': imp,
-                'rank': i + 1
-            })
-        
        return {
            'model_type': 'default',
-            'features': features
+            'features': [
+                {
+                    'name': name,
+                    'name_cn': config.FEATURE_NAME_CN.get(name, name),
+                    'importance': importance,
+                    'rank': idx + 1,
+                }
+                for idx, (name, importance) in enumerate(defaults)
+            ],
        }
-    
+
    def get_correlation(self):
        return get_correlation_for_heatmap()
-    
+
    def get_group_comparison(self, dimension):
-        valid_dimensions = ['drinker', 'smoker', 'education', 'children', 'pet']
-        
-        if dimension not in valid_dimensions:
-            raise ValueError(f"Invalid dimension: {dimension}. Must be one of {valid_dimensions}")
-        
        return group_comparison(dimension)


@@ -11,7 +11,7 @@ class ClusterService:
    def get_cluster_profile(self, n_clusters=3):
        return self.analyzer.get_cluster_profile(n_clusters)
    
-    def get_scatter_data(self, n_clusters=3, x_axis='Age', y_axis='Absenteeism time in hours'):
+    def get_scatter_data(self, n_clusters=3, x_axis='月均加班时长', y_axis='缺勤时长（小时）'):
        return self.analyzer.get_scatter_data(n_clusters, x_axis, y_axis)


@@ -1,6 +1,3 @@
-import pandas as pd
-import numpy as np
-
 import config
 from core.preprocessing import get_clean_data

@@ -8,154 +5,103 @@ from core.preprocessing import get_clean_data
 class DataService:
    def __init__(self):
        self._df = None
-    
+
    @property
    def df(self):
        if self._df is None:
            self._df = get_clean_data()
        return self._df
-    
+
    def get_basic_stats(self):
        df = self.df
-        
        total_records = len(df)
-        total_employees = df['ID'].nunique()
-        total_absent_hours = df['Absenteeism time in hours'].sum()
-        avg_absent_hours = round(df['Absenteeism time in hours'].mean(), 2)
-        max_absent_hours = int(df['Absenteeism time in hours'].max())
-        min_absent_hours = int(df['Absenteeism time in hours'].min())
-        
-        high_risk_count = len(df[df['Absenteeism time in hours'] > 8])
-        high_risk_ratio = round(high_risk_count / total_records, 4)
-        
+        total_employees = df[config.EMPLOYEE_ID_COLUMN].nunique()
+        avg_absent_hours = round(df[config.TARGET_COLUMN].mean(), 2)
+        max_absent_hours = round(float(df[config.TARGET_COLUMN].max()), 1)
+        min_absent_hours = round(float(df[config.TARGET_COLUMN].min()), 1)
+        high_risk_count = len(df[df[config.TARGET_COLUMN] > 8])
        return {
            'total_records': total_records,
            'total_employees': total_employees,
-            'total_absent_hours': int(total_absent_hours),
            'avg_absent_hours': avg_absent_hours,
            'max_absent_hours': max_absent_hours,
            'min_absent_hours': min_absent_hours,
-            'high_risk_ratio': high_risk_ratio
+            'high_risk_ratio': round(high_risk_count / total_records, 4),
+            'industries_covered': int(df['所属行业'].nunique()),
        }
-    
+
    def get_monthly_trend(self):
        df = self.df
-        
-        monthly = df.groupby('Month of absence').agg({
-            'Absenteeism time in hours': ['sum', 'mean', 'count']
-        }).reset_index()
-        
+        monthly = df.groupby('缺勤月份').agg({config.TARGET_COLUMN: ['sum', 'mean', 'count']}).reset_index()
        monthly.columns = ['month', 'total_hours', 'avg_hours', 'record_count']
-        
-        months = ['1月', '2月', '3月', '4月', '5月', '6月', 
-                  '7月', '8月', '9月', '10月', '11月', '12月']
-        
-        result = {
-            'months': months,
-            'total_hours': [],
-            'avg_hours': [],
-            'record_counts': []
-        }
-        
-        for i in range(1, 13):
-            row = monthly[monthly['month'] == i]
-            if len(row) > 0:
-                result['total_hours'].append(int(row['total_hours'].values[0]))
+        result = {'months': [], 'total_hours': [], 'avg_hours': [], 'record_counts': []}
+        for month in range(1, 13):
+            row = monthly[monthly['month'] == month]
+            result['months'].append(f'{month}月')
+            if len(row):
+                result['total_hours'].append(round(float(row['total_hours'].values[0]), 1))
                result['avg_hours'].append(round(float(row['avg_hours'].values[0]), 2))
                result['record_counts'].append(int(row['record_count'].values[0]))
            else:
                result['total_hours'].append(0)
                result['avg_hours'].append(0)
                result['record_counts'].append(0)
-        
        return result
-    
+
    def get_weekday_distribution(self):
        df = self.df
-        
-        weekday = df.groupby('Day of the week').agg({
-            'Absenteeism time in hours': ['sum', 'mean', 'count']
-        }).reset_index()
-        
+        weekday = df.groupby('星期几').agg({config.TARGET_COLUMN: ['sum', 'mean', 'count']}).reset_index()
        weekday.columns = ['weekday', 'total_hours', 'avg_hours', 'record_count']
-        
-        result = {
-            'weekdays': [],
-            'weekday_codes': [],
-            'total_hours': [],
-            'avg_hours': [],
-            'record_counts': []
-        }
-        
-        for code in [2, 3, 4, 5, 6]:
+        result = {'weekdays': [], 'weekday_codes': [], 'total_hours': [], 'avg_hours': [], 'record_counts': []}
+        for code in range(1, 8):
            row = weekday[weekday['weekday'] == code]
            result['weekdays'].append(config.WEEKDAY_NAMES.get(code, str(code)))
            result['weekday_codes'].append(code)
-            if len(row) > 0:
-                result['total_hours'].append(int(row['total_hours'].values[0]))
+            if len(row):
+                result['total_hours'].append(round(float(row['total_hours'].values[0]), 1))
                result['avg_hours'].append(round(float(row['avg_hours'].values[0]), 2))
                result['record_counts'].append(int(row['record_count'].values[0]))
            else:
                result['total_hours'].append(0)
                result['avg_hours'].append(0)
                result['record_counts'].append(0)
-        
        return result
-    
+
    def get_reason_distribution(self):
        df = self.df
-        
-        reason = df.groupby('Reason for absence').agg({
-            'Absenteeism time in hours': 'count'
-        }).reset_index()
-        
-        reason.columns = ['code', 'count']
+        reason = df.groupby('请假原因大类').agg({config.TARGET_COLUMN: 'count'}).reset_index()
+        reason.columns = ['name', 'count']
        reason = reason.sort_values('count', ascending=False)
-        
        total = reason['count'].sum()
-        
-        result = {
-            'reasons': []
+        return {
+            'reasons': [
+                {
+                    'name': row['name'],
+                    'count': int(row['count']),
+                    'percentage': round(float(row['count']) / total * 100, 1),
+                }
+                for _, row in reason.iterrows()
+            ]
        }
-        
-        for _, row in reason.iterrows():
-            code = int(row['code'])
-            result['reasons'].append({
-                'code': code,
-                'name': config.REASON_NAMES.get(code, f'原因{code}'),
-                'count': int(row['count']),
-                'percentage': round(row['count'] / total * 100, 1)
-            })
-        
-        return result
-    
+
    def get_season_distribution(self):
        df = self.df
-        
-        season = df.groupby('Seasons').agg({
-            'Absenteeism time in hours': ['sum', 'mean', 'count']
-        }).reset_index()
-        
+        season = df.groupby('季节').agg({config.TARGET_COLUMN: ['sum', 'mean', 'count']}).reset_index()
        season.columns = ['season', 'total_hours', 'avg_hours', 'record_count']
-        
        total_records = season['record_count'].sum()
-        
-        result = {
-            'seasons': []
-        }
-        
+        result = {'seasons': []}
        for code in [1, 2, 3, 4]:
            row = season[season['season'] == code]
-            if len(row) > 0:
-                result['seasons'].append({
-                    'code': int(code),
-                    'name': config.SEASON_NAMES.get(code, f'季节{code}'),
-                    'total_hours': int(row['total_hours'].values[0]),
-                    'avg_hours': round(float(row['avg_hours'].values[0]), 2),
-                    'record_count': int(row['record_count'].values[0]),
-                    'percentage': round(row['record_count'].values[0] / total_records * 100, 1)
-                })
-        
+            if not len(row):
+                continue
+            result['seasons'].append({
+                'code': code,
+                'name': config.SEASON_NAMES.get(code, f'季节{code}'),
+                'total_hours': round(float(row['total_hours'].values[0]), 1),
+                'avg_hours': round(float(row['avg_hours'].values[0]), 2),
+                'record_count': int(row['record_count'].values[0]),
+                'percentage': round(float(row['record_count'].values[0]) / total_records * 100, 1),
+            })
        return result


@@ -1,41 +1,25 @@
 import os
-import numpy as np
+
 import joblib
+import numpy as np

 import config
+from core.model_features import (
+    align_feature_frame,
+    apply_label_encoders,
+    build_prediction_dataframe,
+    engineer_features,
+    to_float_array,
+)


 MODEL_INFO = {
-    'random_forest': {
-        'name': 'random_forest',
-        'name_cn': '随机森林',
-        'description': '基于决策树的集成学习算法'
-    },
-    'xgboost': {
-        'name': 'xgboost',
-        'name_cn': 'XGBoost',
-        'description': '高效的梯度提升算法'
-    },
-    'lightgbm': {
-        'name': 'lightgbm',
-        'name_cn': 'LightGBM',
-        'description': '微软轻量级梯度提升框架'
-    },
-    'gradient_boosting': {
-        'name': 'gradient_boosting',
-        'name_cn': 'GBDT',
-        'description': '梯度提升决策树'
-    },
-    'extra_trees': {
-        'name': 'extra_trees',
-        'name_cn': '极端随机树',
-        'description': '随机森林的变体，随机性更强'
-    },
-    'stacking': {
-        'name': 'stacking',
-        'name_cn': 'Stacking集成',
-        'description': '多层堆叠集成学习'
-    }
+    'random_forest': {'name': 'random_forest', 'name_cn': '随机森林', 'description': '稳健的树模型集成'},
+    'xgboost': {'name': 'xgboost', 'name_cn': 'XGBoost', 'description': '梯度提升树模型'},
+    'lightgbm': {'name': 'lightgbm', 'name_cn': 'LightGBM', 'description': '轻量级梯度提升树'},
+    'gradient_boosting': {'name': 'gradient_boosting', 'name_cn': 'GBDT', 'description': '梯度提升决策树'},
+    'extra_trees': {'name': 'extra_trees', 'name_cn': '极端随机树', 'description': '高随机性的树模型'},
+    'stacking': {'name': 'stacking', 'name_cn': 'Stacking集成', 'description': '多模型融合'},
 }


@@ -47,326 +31,172 @@ class PredictService:
        self.selected_features = None
        self.label_encoders = {}
        self.model_metrics = {}
+        self.training_metadata = {}
        self.default_model = 'random_forest'
-    
+
    def _ensure_models_loaded(self):
        if not self.models:
            self.load_models()
-    
+
    def load_models(self):
+        metadata_path = os.path.join(config.MODELS_DIR, 'training_metadata.pkl')
+        if os.path.exists(metadata_path):
+            self.training_metadata = joblib.load(metadata_path)
+
        model_files = {
            'random_forest': 'random_forest_model.pkl',
            'xgboost': 'xgboost_model.pkl',
            'lightgbm': 'lightgbm_model.pkl',
            'gradient_boosting': 'gradient_boosting_model.pkl',
            'extra_trees': 'extra_trees_model.pkl',
-            'stacking': 'stacking_model.pkl'
+            'stacking': 'stacking_model.pkl',
        }
-        
+        allowed_models = self.training_metadata.get('available_models')
+        if allowed_models:
+            model_files = {k: v for k, v in model_files.items() if k in allowed_models}
+
        for name, filename in model_files.items():
-            model_path = os.path.join(config.MODELS_DIR, filename)
-            if os.path.exists(model_path):
+            path = os.path.join(config.MODELS_DIR, filename)
+            if os.path.exists(path):
                try:
-                    self.models[name] = joblib.load(model_path)
-                    print(f"Loaded {name} model")
-                except Exception as e:
-                    print(f"Failed to load {name}: {e}")
-        
+                    self.models[name] = joblib.load(path)
+                except Exception as exc:
+                    print(f'Failed to load model {name}: {exc}')
+
        if os.path.exists(config.SCALER_PATH):
            self.scaler = joblib.load(config.SCALER_PATH)
-        
-        feature_names_path = os.path.join(config.MODELS_DIR, 'feature_names.pkl')
-        if os.path.exists(feature_names_path):
-            self.feature_names = joblib.load(feature_names_path)
-        
-        selected_features_path = os.path.join(config.MODELS_DIR, 'selected_features.pkl')
-        if os.path.exists(selected_features_path):
-            self.selected_features = joblib.load(selected_features_path)
-        
-        label_encoders_path = os.path.join(config.MODELS_DIR, 'label_encoders.pkl')
-        if os.path.exists(label_encoders_path):
-            self.label_encoders = joblib.load(label_encoders_path)
-        
-        metrics_path = os.path.join(config.MODELS_DIR, 'model_metrics.pkl')
-        if os.path.exists(metrics_path):
-            self.model_metrics = joblib.load(metrics_path)
-        
-        if self.model_metrics:
-            valid_metrics = {k: v for k, v in self.model_metrics.items() if k in self.models}
-            if valid_metrics:
-                best_model = max(valid_metrics.items(), key=lambda x: x[1]['r2'])
-                self.default_model = best_model[0]
-    
+        for filename, attr in [
+            ('feature_names.pkl', 'feature_names'),
+            ('selected_features.pkl', 'selected_features'),
+            ('label_encoders.pkl', 'label_encoders'),
+            ('model_metrics.pkl', 'model_metrics'),
+        ]:
+            path = os.path.join(config.MODELS_DIR, filename)
+            if os.path.exists(path):
+                try:
+                    setattr(self, attr, joblib.load(path))
+                except Exception as exc:
+                    print(f'Failed to load artifact {filename}: {exc}')
+
+        valid_metrics = {key: value for key, value in self.model_metrics.items() if key in self.models}
+        if valid_metrics:
+            self.default_model = max(valid_metrics.items(), key=lambda item: item[1]['r2'])[0]
+
    def get_available_models(self):
        self._ensure_models_loaded()
-        
        models = []
        for name in self.models.keys():
-            info = MODEL_INFO.get(name, {
-                'name': name,
-                'name_cn': name,
-                'description': ''
-            }).copy()
+            info = MODEL_INFO.get(name, {'name': name, 'name_cn': name, 'description': ''}).copy()
            info['is_available'] = True
-            info['is_default'] = (name == self.default_model)
-            
-            if name in self.model_metrics:
-                info['metrics'] = self.model_metrics[name]
-            else:
-                info['metrics'] = {'r2': 0, 'rmse': 0, 'mae': 0}
-            
+            info['is_default'] = name == self.default_model
+            info['metrics'] = self.model_metrics.get(name, {'r2': 0, 'rmse': 0, 'mae': 0})
            models.append(info)
-        
-        models.sort(key=lambda x: x['metrics']['r2'], reverse=True)
-        
+        models.sort(key=lambda item: item['metrics']['r2'], reverse=True)
        return models
-    
+
    def predict_single(self, data, model_type=None):
        self._ensure_models_loaded()
-        
-        if model_type is None:
-            model_type = self.default_model
-        
+        model_type = model_type or self.default_model
        if model_type not in self.models:
-            available = list(self.models.keys())
-            if available:
-                model_type = available[0]
-            else:
+            fallback = next(iter(self.models), None)
+            if fallback is None:
                return self._get_default_prediction(data)
-        
-        model = self.models[model_type]
-        
+            model_type = fallback
        if self.scaler is None or self.feature_names is None:
            return self._get_default_prediction(data)
-        
+
        features = self._prepare_features(data)
-        
        try:
-            predicted_hours = model.predict([features])[0]
-            predicted_hours = max(0, float(predicted_hours))
-        except Exception as e:
-            print(f"Prediction error: {e}")
+            predicted_hours = self.models[model_type].predict([features])[0]
+            predicted_hours = self._inverse_transform_prediction(predicted_hours)
+            predicted_hours = max(0.5, float(predicted_hours))
+        except Exception:
            return self._get_default_prediction(data)
-        
+
        risk_level, risk_label = self._get_risk_level(predicted_hours)
-        
-        confidence = 0.85
-        if model_type in self.model_metrics:
-            confidence = max(0.5, self.model_metrics[model_type].get('r2', 0.85))
-        
+        confidence = max(0.5, self.model_metrics.get(model_type, {}).get('r2', 0.82))
        return {
            'predicted_hours': round(predicted_hours, 2),
            'risk_level': risk_level,
            'risk_label': risk_label,
            'confidence': round(confidence, 2),
            'model_used': model_type,
-            'model_name_cn': MODEL_INFO.get(model_type, {}).get('name_cn', model_type)
+            'model_name_cn': MODEL_INFO.get(model_type, {}).get('name_cn', model_type),
        }
-    
+
    def predict_compare(self, data):
        self._ensure_models_loaded()
-        
        results = []
-        
        for name in self.models.keys():
-            try:
-                result = self.predict_single(data, name)
-                result['model'] = name
-                result['model_name_cn'] = MODEL_INFO.get(name, {}).get('name_cn', name)
-                
-                if name in self.model_metrics:
-                    result['r2'] = self.model_metrics[name]['r2']
-                else:
-                    result['r2'] = 0
-                
-                results.append(result)
-            except Exception as e:
-                print(f"Compare error for {name}: {e}")
-        
-        results.sort(key=lambda x: x.get('r2', 0), reverse=True)
-        
+            result = self.predict_single(data, name)
+            result['model'] = name
+            result['model_name_cn'] = MODEL_INFO.get(name, {}).get('name_cn', name)
+            result['r2'] = self.model_metrics.get(name, {}).get('r2', 0)
+            results.append(result)
+        results.sort(key=lambda item: item.get('r2', 0), reverse=True)
        if results:
            results[0]['recommended'] = True
-        
        return results
-    
+
    def _prepare_features(self, data):
-        feature_map = {
-            'Reason for absence': data.get('reason_for_absence', 23),
-            'Month of absence': data.get('month_of_absence', 7),
-            'Day of the week': data.get('day_of_week', 3),
-            'Seasons': data.get('seasons', 1),
-            'Transportation expense': data.get('transportation_expense', 200),
-            'Distance from Residence to Work': data.get('distance', 20),
-            'Service time': data.get('service_time', 5),
-            'Age': data.get('age', 30),
-            'Work load Average/day': data.get('work_load', 250),
-            'Hit target': data.get('hit_target', 95),
-            'Disciplinary failure': data.get('disciplinary_failure', 0),
-            'Education': data.get('education', 1),
-            'Son': data.get('son', 0),
-            'Social drinker': data.get('social_drinker', 0),
-            'Social smoker': data.get('social_smoker', 0),
-            'Pet': data.get('pet', 0),
-            'Body mass index': data.get('bmi', 25)
-        }
-        
-        age = feature_map['Age']
-        service_time = feature_map['Service time']
-        work_load = feature_map['Work load Average/day']
-        distance = feature_map['Distance from Residence to Work']
-        expense = feature_map['Transportation expense']
-        bmi = feature_map['Body mass index']
-        son = feature_map['Son']
-        pet = feature_map['Pet']
-        social_drinker = feature_map['Social drinker']
-        social_smoker = feature_map['Social smoker']
-        hit_target = feature_map['Hit target']
-        seasons = feature_map['Seasons']
-        day_of_week = feature_map['Day of the week']
-        
-        derived_features = {
-            'workload_per_age': work_load / (age + 1),
-            'expense_per_distance': expense / (distance + 1),
-            'age_service_ratio': age / (service_time + 1),
-            'has_children': 1 if son > 0 else 0,
-            'has_pet': 1 if pet > 0 else 0,
-            'family_responsibility': son + pet,
-            'health_risk': 1 if (social_drinker == 1 or social_smoker == 1 or bmi > 30) else 0,
-            'lifestyle_risk': int(social_drinker) + int(social_smoker),
-            'age_group': 1 if age <= 30 else (2 if age <= 40 else (3 if age <= 50 else 4)),
-            'service_group': 1 if service_time <= 5 else (2 if service_time <= 10 else (3 if service_time <= 20 else 4)),
-            'bmi_category': 1 if bmi <= 18.5 else (2 if bmi <= 25 else (3 if bmi <= 30 else 4)),
-            'workload_category': 1 if work_load <= 200 else (2 if work_load <= 250 else (3 if work_load <= 300 else 4)),
-            'commute_category': 1 if distance <= 10 else (2 if distance <= 20 else (3 if distance <= 50 else 4)),
-            'seasonal_risk': 1 if seasons in [1, 3] else 0,
-            'weekday_risk': 1 if day_of_week in [2, 6] else 0,
-            'hit_target_ratio': hit_target / 100,
-            'experience_level': 1 if service_time <= 5 else (2 if service_time <= 10 else (3 if service_time <= 15 else 4)),
-            'age_workload_interaction': age * work_load / 10000,
-            'service_bmi_interaction': service_time * bmi / 100
-        }
-        
-        all_features = {**feature_map, **derived_features}
-        
-        features = []
-        for fname in self.feature_names:
-            if fname in all_features:
-                val = all_features[fname]
-                
-                if fname in self.label_encoders:
-                    try:
-                        val = self.label_encoders[fname].transform([str(val)])[0]
-                    except:
-                        val = 0
-                
-                features.append(float(val))
-            else:
-                features.append(0.0)
-        
-        features = np.array(features).reshape(1, -1)
-        features = self.scaler.transform(features)[0]
-        
+        X_df = build_prediction_dataframe(data)
+        X_df = engineer_features(X_df)
+        X_df = apply_label_encoders(X_df, self.label_encoders)
+        X_df = align_feature_frame(X_df, self.feature_names)
+        features = self.scaler.transform(to_float_array(X_df))[0]
        if self.selected_features:
-            selected_indices = []
-            for sf in self.selected_features:
-                if sf in self.feature_names:
-                    selected_indices.append(self.feature_names.index(sf))
+            selected_indices = [self.feature_names.index(name) for name in self.selected_features if name in self.feature_names]
            if selected_indices:
                features = features[selected_indices]
-        
        return features
-    
+
+    def _inverse_transform_prediction(self, prediction):
+        if self.training_metadata.get('target_transform') == 'log1p':
+            return float(np.expm1(prediction))
+        return float(prediction)
+
    def _get_risk_level(self, hours):
        if hours < 4:
            return 'low', '低风险'
-        elif hours <= 8:
+        if hours <= 8:
            return 'medium', '中风险'
-        else:
-            return 'high', '高风险'
-    
+        return 'high', '高风险'
+
    def _get_default_prediction(self, data):
-        base_hours = 5.0
-        
-        expense = data.get('transportation_expense', 200)
-        if expense > 300:
-            base_hours += 1.0
-        elif expense < 150:
+        base_hours = 3.8
+        base_hours += min(float(data.get('monthly_overtime_hours', 24)) / 20, 3.0)
+        base_hours += min(float(data.get('commute_minutes', 40)) / 50, 2.0)
+        base_hours += 1.6 if int(data.get('is_night_shift', 0)) == 1 else 0
+        base_hours += 1.8 if int(data.get('chronic_disease_flag', 0)) == 1 else 0
+        base_hours += 0.9 if int(data.get('near_holiday_flag', 0)) == 1 else 0
+        base_hours += 0.8 if int(data.get('medical_certificate_flag', 0)) == 1 else 0
+        base_hours += 0.5 * int(data.get('children_count', 0))
+        if data.get('leave_type') in ['病假', '工伤假', '婚假', '丧假']:
+            base_hours += 2.5
+        if data.get('stress_level') == '高':
+            base_hours += 0.9
+        if data.get('performance_level') == 'A':
            base_hours -= 0.5
-        
-        distance = data.get('distance', 20)
-        if distance > 40:
-            base_hours += 1.5
-        elif distance > 25:
-            base_hours += 0.8
-        
-        service_time = data.get('service_time', 5)
-        if service_time < 3:
-            base_hours += 0.5
-        elif service_time > 15:
-            base_hours -= 0.5
-        
-        age = data.get('age', 30)
-        if age > 50:
-            base_hours += 0.5
-        elif age < 25:
-            base_hours += 0.3
-        
-        work_load = data.get('work_load', 250)
-        if work_load > 300:
-            base_hours += 1.5
-        elif work_load > 260:
-            base_hours += 0.5
-        
-        bmi = data.get('bmi', 25)
-        if bmi > 30:
-            base_hours += 0.8
-        elif bmi < 20:
-            base_hours += 0.3
-        
-        if data.get('social_drinker', 0) == 1:
-            base_hours += 0.8
-        if data.get('social_smoker', 0) == 1:
-            base_hours += 0.5
-        
-        son = data.get('son', 0)
-        if son > 0:
-            base_hours += 0.3 * son
-        
-        pet = data.get('pet', 0)
-        if pet > 0:
-            base_hours -= 0.1 * pet
-        
-        hit_target = data.get('hit_target', 95)
-        if hit_target < 90:
-            base_hours += 0.5
-        
-        base_hours = max(0.5, base_hours)
-        
        risk_level, risk_label = self._get_risk_level(base_hours)
-        
        return {
-            'predicted_hours': round(base_hours, 2),
+            'predicted_hours': round(max(0.5, base_hours), 2),
            'risk_level': risk_level,
            'risk_label': risk_label,
-            'confidence': 0.75,
+            'confidence': 0.72,
            'model_used': 'default',
-            'model_name_cn': '默认规则'
+            'model_name_cn': '默认规则',
        }
-    
+
    def get_model_info(self):
        self._ensure_models_loaded()
-        
-        models = self.get_available_models()
-        
        return {
-            'models': models,
+            'models': self.get_available_models(),
            'training_info': {
-                'train_samples': 2884,
-                'test_samples': 722,
-                'feature_count': len(self.feature_names) if self.feature_names else 20,
-                'training_date': '2026-03-08'
-            }
+                'train_samples': self.training_metadata.get('train_samples', 0),
+                'test_samples': self.training_metadata.get('test_samples', 0),
+                'feature_count': self.training_metadata.get('feature_count_after_selection', 0),
+                'training_date': self.training_metadata.get('training_date', ''),
+            },
        }