feat: 将数据集从国外员工缺勤数据替换为中国企业缺勤模拟数据

- 新增中国企业员工缺勤模拟数据集生成脚本(generate_dataset.py)，覆盖7个行业、180家企业、2600名员工
- 重构 config.py，更新特征字段为中文名称，调整目标列、员工ID、行业类型等配置
- 重构 clustering.py，简化聚类逻辑，更新聚类特征和群体命名（高压通勤型、健康波动型等）
- 重构 feature_mining.py，更新相关性分析和群体比较维度（按行业、班次、婚姻状态等）
- 新增 model_features.py 定义模型训练特征
- 更新 preprocessing.py 和 train_model.py 适配新数据结构
- 更新各 API 路由默认参数（model: random_forest, dimension: industry）
- 前端更新主题样式和各视图组件适配中文字段
- 更新系统名称为 China Enterprise Absence Analysis System
2026-03-11 10:46:58 +08:00
parent a39d8b2fd2
commit e63267cef6
39 changed files with 15731 additions and 5648 deletions
--- a/backend/services/data_service.py
+++ b/backend/services/data_service.py
@@ -1,6 +1,3 @@
-import pandas as pd
-import numpy as np
-
 import config
 from core.preprocessing import get_clean_data

@@ -8,154 +5,103 @@ from core.preprocessing import get_clean_data
 class DataService:
    def __init__(self):
        self._df = None
-    
+
    @property
    def df(self):
        if self._df is None:
            self._df = get_clean_data()
        return self._df
-    
+
    def get_basic_stats(self):
        df = self.df
-        
        total_records = len(df)
-        total_employees = df['ID'].nunique()
-        total_absent_hours = df['Absenteeism time in hours'].sum()
-        avg_absent_hours = round(df['Absenteeism time in hours'].mean(), 2)
-        max_absent_hours = int(df['Absenteeism time in hours'].max())
-        min_absent_hours = int(df['Absenteeism time in hours'].min())
-        
-        high_risk_count = len(df[df['Absenteeism time in hours'] > 8])
-        high_risk_ratio = round(high_risk_count / total_records, 4)
-        
+        total_employees = df[config.EMPLOYEE_ID_COLUMN].nunique()
+        avg_absent_hours = round(df[config.TARGET_COLUMN].mean(), 2)
+        max_absent_hours = round(float(df[config.TARGET_COLUMN].max()), 1)
+        min_absent_hours = round(float(df[config.TARGET_COLUMN].min()), 1)
+        high_risk_count = len(df[df[config.TARGET_COLUMN] > 8])
        return {
            'total_records': total_records,
            'total_employees': total_employees,
-            'total_absent_hours': int(total_absent_hours),
            'avg_absent_hours': avg_absent_hours,
            'max_absent_hours': max_absent_hours,
            'min_absent_hours': min_absent_hours,
-            'high_risk_ratio': high_risk_ratio
+            'high_risk_ratio': round(high_risk_count / total_records, 4),
+            'industries_covered': int(df['所属行业'].nunique()),
        }
-    
+
    def get_monthly_trend(self):
        df = self.df
-        
-        monthly = df.groupby('Month of absence').agg({
-            'Absenteeism time in hours': ['sum', 'mean', 'count']
-        }).reset_index()
-        
+        monthly = df.groupby('缺勤月份').agg({config.TARGET_COLUMN: ['sum', 'mean', 'count']}).reset_index()
        monthly.columns = ['month', 'total_hours', 'avg_hours', 'record_count']
-        
-        months = ['1月', '2月', '3月', '4月', '5月', '6月', 
-                  '7月', '8月', '9月', '10月', '11月', '12月']
-        
-        result = {
-            'months': months,
-            'total_hours': [],
-            'avg_hours': [],
-            'record_counts': []
-        }
-        
-        for i in range(1, 13):
-            row = monthly[monthly['month'] == i]
-            if len(row) > 0:
-                result['total_hours'].append(int(row['total_hours'].values[0]))
+        result = {'months': [], 'total_hours': [], 'avg_hours': [], 'record_counts': []}
+        for month in range(1, 13):
+            row = monthly[monthly['month'] == month]
+            result['months'].append(f'{month}月')
+            if len(row):
+                result['total_hours'].append(round(float(row['total_hours'].values[0]), 1))
                result['avg_hours'].append(round(float(row['avg_hours'].values[0]), 2))
                result['record_counts'].append(int(row['record_count'].values[0]))
            else:
                result['total_hours'].append(0)
                result['avg_hours'].append(0)
                result['record_counts'].append(0)
-        
        return result
-    
+
    def get_weekday_distribution(self):
        df = self.df
-        
-        weekday = df.groupby('Day of the week').agg({
-            'Absenteeism time in hours': ['sum', 'mean', 'count']
-        }).reset_index()
-        
+        weekday = df.groupby('星期几').agg({config.TARGET_COLUMN: ['sum', 'mean', 'count']}).reset_index()
        weekday.columns = ['weekday', 'total_hours', 'avg_hours', 'record_count']
-        
-        result = {
-            'weekdays': [],
-            'weekday_codes': [],
-            'total_hours': [],
-            'avg_hours': [],
-            'record_counts': []
-        }
-        
-        for code in [2, 3, 4, 5, 6]:
+        result = {'weekdays': [], 'weekday_codes': [], 'total_hours': [], 'avg_hours': [], 'record_counts': []}
+        for code in range(1, 8):
            row = weekday[weekday['weekday'] == code]
            result['weekdays'].append(config.WEEKDAY_NAMES.get(code, str(code)))
            result['weekday_codes'].append(code)
-            if len(row) > 0:
-                result['total_hours'].append(int(row['total_hours'].values[0]))
+            if len(row):
+                result['total_hours'].append(round(float(row['total_hours'].values[0]), 1))
                result['avg_hours'].append(round(float(row['avg_hours'].values[0]), 2))
                result['record_counts'].append(int(row['record_count'].values[0]))
            else:
                result['total_hours'].append(0)
                result['avg_hours'].append(0)
                result['record_counts'].append(0)
-        
        return result
-    
+
    def get_reason_distribution(self):
        df = self.df
-        
-        reason = df.groupby('Reason for absence').agg({
-            'Absenteeism time in hours': 'count'
-        }).reset_index()
-        
-        reason.columns = ['code', 'count']
+        reason = df.groupby('请假原因大类').agg({config.TARGET_COLUMN: 'count'}).reset_index()
+        reason.columns = ['name', 'count']
        reason = reason.sort_values('count', ascending=False)
-        
        total = reason['count'].sum()
-        
-        result = {
-            'reasons': []
+        return {
+            'reasons': [
+                {
+                    'name': row['name'],
+                    'count': int(row['count']),
+                    'percentage': round(float(row['count']) / total * 100, 1),
+                }
+                for _, row in reason.iterrows()
+            ]
        }
-        
-        for _, row in reason.iterrows():
-            code = int(row['code'])
-            result['reasons'].append({
-                'code': code,
-                'name': config.REASON_NAMES.get(code, f'原因{code}'),
-                'count': int(row['count']),
-                'percentage': round(row['count'] / total * 100, 1)
-            })
-        
-        return result
-    
+
    def get_season_distribution(self):
        df = self.df
-        
-        season = df.groupby('Seasons').agg({
-            'Absenteeism time in hours': ['sum', 'mean', 'count']
-        }).reset_index()
-        
+        season = df.groupby('季节').agg({config.TARGET_COLUMN: ['sum', 'mean', 'count']}).reset_index()
        season.columns = ['season', 'total_hours', 'avg_hours', 'record_count']
-        
        total_records = season['record_count'].sum()
-        
-        result = {
-            'seasons': []
-        }
-        
+        result = {'seasons': []}
        for code in [1, 2, 3, 4]:
            row = season[season['season'] == code]
-            if len(row) > 0:
-                result['seasons'].append({
-                    'code': int(code),
-                    'name': config.SEASON_NAMES.get(code, f'季节{code}'),
-                    'total_hours': int(row['total_hours'].values[0]),
-                    'avg_hours': round(float(row['avg_hours'].values[0]), 2),
-                    'record_count': int(row['record_count'].values[0]),
-                    'percentage': round(row['record_count'].values[0] / total_records * 100, 1)
-                })
-        
+            if not len(row):
+                continue
+            result['seasons'].append({
+                'code': code,
+                'name': config.SEASON_NAMES.get(code, f'季节{code}'),
+                'total_hours': round(float(row['total_hours'].values[0]), 1),
+                'avg_hours': round(float(row['avg_hours'].values[0]), 2),
+                'record_count': int(row['record_count'].values[0]),
+                'percentage': round(float(row['record_count'].values[0]) / total_records * 100, 1),
+            })
        return result