feat: 将数据集从国外员工缺勤数据替换为中国企业缺勤模拟数据

- 新增中国企业员工缺勤模拟数据集生成脚本(generate_dataset.py)，覆盖7个行业、180家企业、2600名员工
- 重构 config.py，更新特征字段为中文名称，调整目标列、员工ID、行业类型等配置
- 重构 clustering.py，简化聚类逻辑，更新聚类特征和群体命名（高压通勤型、健康波动型等）
- 重构 feature_mining.py，更新相关性分析和群体比较维度（按行业、班次、婚姻状态等）
- 新增 model_features.py 定义模型训练特征
- 更新 preprocessing.py 和 train_model.py 适配新数据结构
- 更新各 API 路由默认参数（model: random_forest, dimension: industry）
- 前端更新主题样式和各视图组件适配中文字段
- 更新系统名称为 China Enterprise Absence Analysis System
2026-03-11 10:46:58 +08:00
parent a39d8b2fd2
commit e63267cef6
39 changed files with 15731 additions and 5648 deletions
--- a/backend/core/feature_mining.py
+++ b/backend/core/feature_mining.py
@@ -1,4 +1,3 @@
-import pandas as pd
 import numpy as np

 import config
@@ -7,145 +6,67 @@ from core.preprocessing import get_clean_data

 def calculate_correlation():
    df = get_clean_data()
-    
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
-    
-    if 'ID' in numeric_cols:
-        numeric_cols.remove('ID')
-    
-    corr_matrix = df[numeric_cols].corr()
-    
-    return corr_matrix
+    for candidate in [config.EMPLOYEE_ID_COLUMN]:
+        if candidate in numeric_cols:
+            numeric_cols.remove(candidate)
+    return df[numeric_cols].corr()


 def get_correlation_for_heatmap():
    corr_matrix = calculate_correlation()
-    
    key_features = [
-        'Age',
-        'Service time',
-        'Distance from Residence to Work',
-        'Work load Average/day ',
-        'Body mass index',
-        'Absenteeism time in hours'
+        '月均加班时长',
+        '通勤时长分钟',
+        '近90天缺勤次数',
+        'BMI',
+        '近30天睡眠时长均值',
+        '缺勤时长（小时）',
    ]
-    
    key_features = [f for f in key_features if f in corr_matrix.columns]
-    
    sub_matrix = corr_matrix.loc[key_features, key_features]
-    
-    result = {
+    return {
        'features': [config.FEATURE_NAME_CN.get(f, f) for f in key_features],
-        'matrix': sub_matrix.values.round(2).tolist()
+        'matrix': sub_matrix.values.round(2).tolist(),
    }
-    
-    return result
-
-
-def calculate_feature_importance(model, feature_names):
-    if hasattr(model, 'feature_importances_'):
-        importance = model.feature_importances_
-    else:
-        raise ValueError("Model does not have feature_importances_ attribute")
-    
-    importance_dict = dict(zip(feature_names, importance))
-    
-    sorted_importance = sorted(importance_dict.items(), key=lambda x: x[1], reverse=True)
-    
-    return sorted_importance
-
-
-def get_feature_importance_from_model(model_path, feature_names):
-    import joblib
-    
-    model = joblib.load(model_path)
-    return calculate_feature_importance(model, feature_names)


 def group_comparison(dimension):
    df = get_clean_data()
-    
    dimension_map = {
-        'drinker': ('Social drinker', {0: '不饮酒', 1: '饮酒'}),
-        'smoker': ('Social smoker', {0: '不吸烟', 1: '吸烟'}),
-        'education': ('Education', {1: '高中', 2: '本科', 3: '研究生', 4: '博士'}),
-        'children': ('Son', {0: '无子女'}, lambda x: x > 0, '有子女'),
-        'pet': ('Pet', {0: '无宠物'}, lambda x: x > 0, '有宠物')
+        'industry': ('所属行业', None, '所属行业'),
+        'shift_type': ('班次类型', None, '班次类型'),
+        'job_family': ('岗位序列', None, '岗位序列'),
+        'marital_status': ('婚姻状态', None, '婚姻状态'),
+        'chronic_disease': ('是否慢性病史', {0: '无慢性病史', 1: '有慢性病史'}, '慢性病史'),
    }
-    
    if dimension not in dimension_map:
        raise ValueError(f"Invalid dimension: {dimension}")
-    
-    col, value_map = dimension_map[dimension][0], dimension_map[dimension][1]
-    
-    if dimension in ['children', 'pet']:
-        threshold_fn = dimension_map[dimension][2]
-        other_label = dimension_map[dimension][3]
-        
-        groups = []
-        for val in [0]:
-            group_df = df[df[col] == val]
-            if len(group_df) > 0:
-                groups.append({
-                    'name': value_map.get(val, str(val)),
-                    'value': val,
-                    'avg_hours': round(group_df['Absenteeism time in hours'].mean(), 2),
-                    'count': len(group_df),
-                    'percentage': round(len(group_df) / len(df) * 100, 1)
-                })
-        
-        group_df = df[df[col].apply(threshold_fn)]
-        if len(group_df) > 0:
-            groups.append({
-                'name': other_label,
-                'value': 1,
-                'avg_hours': round(group_df['Absenteeism time in hours'].mean(), 2),
-                'count': len(group_df),
-                'percentage': round(len(group_df) / len(df) * 100, 1)
-            })
-    else:
-        groups = []
-        for val in sorted(df[col].unique()):
-            group_df = df[df[col] == val]
-            if len(group_df) > 0:
-                groups.append({
-                    'name': value_map.get(val, str(val)),
-                    'value': int(val),
-                    'avg_hours': round(group_df['Absenteeism time in hours'].mean(), 2),
-                    'count': len(group_df),
-                    'percentage': round(len(group_df) / len(df) * 100, 1)
-                })
-    
-    if len(groups) >= 2:
-        diff_value = abs(groups[0]['avg_hours'] - groups[1]['avg_hours'])
-        base = min(groups[0]['avg_hours'], groups[1]['avg_hours'])
-        diff_percentage = round(diff_value / base * 100, 1) if base > 0 else 0
-    else:
-        diff_value = 0
-        diff_percentage = 0
-    
+
+    column, value_map, dimension_name = dimension_map[dimension]
+    groups = []
+    for value in sorted(df[column].unique()):
+        group_df = df[df[column] == value]
+        groups.append({
+            'name': value_map.get(value, value) if value_map else str(value),
+            'value': int(value) if isinstance(value, (int, np.integer)) else str(value),
+            'avg_hours': round(group_df[config.TARGET_COLUMN].mean(), 2),
+            'count': int(len(group_df)),
+            'percentage': round(len(group_df) / len(df) * 100, 1),
+        })
+
+    groups.sort(key=lambda item: item['avg_hours'], reverse=True)
+    top = groups[0]['avg_hours'] if groups else 0
+    bottom = groups[-1]['avg_hours'] if len(groups) > 1 else 0
+    diff_value = round(top - bottom, 2)
+    diff_percentage = round(diff_value / bottom * 100, 1) if bottom else 0
+
    return {
        'dimension': dimension,
-        'dimension_name': {
-            'drinker': '饮酒习惯',
-            'smoker': '吸烟习惯',
-            'education': '学历',
-            'children': '子女',
-            'pet': '宠物'
-        }.get(dimension, dimension),
+        'dimension_name': dimension_name,
        'groups': groups,
        'difference': {
            'value': diff_value,
-            'percentage': diff_percentage
-        }
+            'percentage': diff_percentage,
+        },
    }
-
-
-if __name__ == '__main__':
-    print("Correlation matrix:")
-    corr = get_correlation_for_heatmap()
-    print(corr)
-    
-    print("\nGroup comparison (drinker):")
-    comp = group_comparison('drinker')
-    print(comp)