feat: 初始化员工缺勤分析系统项目

搭建完整的前后端分离架构，实现数据概览、预测分析、聚类分析等核心功能模块。

详细版：

feat: 初始化员工缺勤分析系统项目

- 后端：基于 Flask 搭建 RESTful API，包含数据概览、特征分析、预测模型、聚类分析四大模块
- 前端：基于 Vue.js 构建单页应用，实现 Dashboard、预测、聚类、因子分析等页面
- 模型：集成随机森林、XGBoost、LightGBM、Stacking 等多种机器学习模型
- 文档：完成需求规格说明、系统架构设计、接口设计、数据设计、UI 原型设计等文档
2026-03-08 14:48:26 +08:00
commit a39d8b2fd2
48 changed files with 9546 additions and 0 deletions
--- a/backend/services/data_service.py
+++ b/backend/services/data_service.py
@@ -0,0 +1,162 @@
+import pandas as pd
+import numpy as np
+
+import config
+from core.preprocessing import get_clean_data
+
+
+class DataService:
+    """Read-only aggregations over the cleaned absenteeism dataset.
+
+    The DataFrame is fetched with ``get_clean_data()`` the first time
+    :attr:`df` is read and is cached on the instance afterwards. Every
+    ``get_*`` method returns a dict whose numeric values are converted to
+    built-in ``int``/``float``.
+    """
+
+    _HOURS_COLUMN = 'Absenteeism time in hours'
+    # A record is counted as "high risk" when its hours exceed this value.
+    _HIGH_RISK_HOURS = 8
+
+    def __init__(self):
+        # Populated on first access to ``df``.
+        self._df = None
+
+    @property
+    def df(self):
+        """Return the cleaned dataset, loading it on first access."""
+        if self._df is None:
+            self._df = get_clean_data()
+        return self._df
+
+    def _hours_by(self, group_column):
+        """Aggregate absence hours per distinct value of ``group_column``.
+
+        Returns:
+            dict mapping each group value to a
+            ``(total_hours, avg_hours, record_count)`` tuple; the total is
+            truncated to ``int`` and the average rounded to 2 decimals.
+        """
+        aggregated = (
+            self.df.groupby(group_column)[self._HOURS_COLUMN]
+            .agg(['sum', 'mean', 'count'])
+        )
+        # name=None yields plain (index, sum, mean, count) tuples.
+        return {
+            key: (int(total), round(float(mean), 2), int(count))
+            for key, total, mean, count in aggregated.itertuples(name=None)
+        }
+
+    @staticmethod
+    def _columns_for(stats, codes):
+        """Return parallel total/average/count lists for ``codes``.
+
+        Codes that are missing from ``stats`` contribute zeros.
+        """
+        rows = [stats.get(code, (0, 0, 0)) for code in codes]
+        totals, averages, counts = (list(column) for column in zip(*rows))
+        return totals, averages, counts
+
+    @staticmethod
+    def _percentage(part, whole):
+        """Return ``part / whole`` as a percentage rounded to 1 decimal.
+
+        ``np.round`` is used (rather than built-in ``round`` on a Python
+        float) so the result matches rounding applied to NumPy scalars.
+        """
+        return float(np.round(part / whole * 100, 1))
+
+    def get_basic_stats(self):
+        """Return headline statistics for the whole dataset.
+
+        Returns:
+            dict with ``total_records``, ``total_employees`` (distinct
+            ``ID`` values), ``total_absent_hours``, ``avg_absent_hours``
+            (2 decimals), ``max_absent_hours``, ``min_absent_hours`` and
+            ``high_risk_ratio`` (share of records above 8 hours, 4
+            decimals). All values are zero when the dataset has no rows.
+        """
+        df = self.df
+        total_records = len(df)
+
+        if total_records == 0:
+            # max()/min()/mean() of an empty column are NaN and the ratio
+            # would divide by zero, so report an all-zero summary instead.
+            return {
+                'total_records': 0,
+                'total_employees': 0,
+                'total_absent_hours': 0,
+                'avg_absent_hours': 0.0,
+                'max_absent_hours': 0,
+                'min_absent_hours': 0,
+                'high_risk_ratio': 0.0
+            }
+
+        hours = df[self._HOURS_COLUMN]
+        high_risk_count = int((hours > self._HIGH_RISK_HOURS).sum())
+
+        return {
+            'total_records': total_records,
+            'total_employees': int(df['ID'].nunique()),
+            'total_absent_hours': int(hours.sum()),
+            'avg_absent_hours': float(round(hours.mean(), 2)),
+            'max_absent_hours': int(hours.max()),
+            'min_absent_hours': int(hours.min()),
+            'high_risk_ratio': round(high_risk_count / total_records, 4)
+        }
+
+    def get_monthly_trend(self):
+        """Return absence totals, averages and record counts per month.
+
+        Returns:
+            dict with ``months`` (labels for months 1-12) and the parallel
+            lists ``total_hours``, ``avg_hours`` and ``record_counts``.
+            Months without records are reported as zeros; month values
+            outside 1-12 are ignored.
+        """
+        month_numbers = range(1, 13)
+        totals, averages, counts = self._columns_for(
+            self._hours_by('Month of absence'), month_numbers
+        )
+
+        return {
+            'months': [f'{month}月' for month in month_numbers],
+            'total_hours': totals,
+            'avg_hours': averages,
+            'record_counts': counts
+        }
+
+    def get_weekday_distribution(self):
+        """Return absence totals, averages and record counts per weekday.
+
+        Returns:
+            dict with ``weekdays`` (display names from
+            ``config.WEEKDAY_NAMES``, falling back to the code as text),
+            ``weekday_codes`` and the parallel lists ``total_hours``,
+            ``avg_hours`` and ``record_counts``. Codes without records are
+            reported as zeros.
+        """
+        # NOTE(review): presumably the dataset's encoding of the working
+        # days - confirm against config.WEEKDAY_NAMES.
+        codes = [2, 3, 4, 5, 6]
+        totals, averages, counts = self._columns_for(
+            self._hours_by('Day of the week'), codes
+        )
+
+        return {
+            'weekdays': [
+                config.WEEKDAY_NAMES.get(code, str(code)) for code in codes
+            ],
+            'weekday_codes': codes,
+            'total_hours': totals,
+            'avg_hours': averages,
+            'record_counts': counts
+        }
+
+    def get_reason_distribution(self):
+        """Return absence reasons ranked by number of records.
+
+        Returns:
+            dict with a ``reasons`` list, most frequent first. Each entry
+            has ``code``, ``name`` (from ``config.REASON_NAMES`` with a
+            generated fallback), ``count`` and ``percentage`` (share of
+            all counted records, 1 decimal).
+        """
+        counts = (
+            self.df.groupby('Reason for absence')[self._HOURS_COLUMN]
+            .count()
+            # Stable sort so equally frequent reasons come back in a
+            # reproducible order.
+            .sort_values(ascending=False, kind='mergesort')
+        )
+        total = counts.sum()
+
+        reasons = []
+        for raw_code, count in counts.items():
+            code = int(raw_code)
+            reasons.append({
+                'code': code,
+                'name': config.REASON_NAMES.get(code, f'原因{code}'),
+                'count': int(count),
+                'percentage': self._percentage(count, total)
+            })
+
+        return {'reasons': reasons}
+
+    def get_season_distribution(self):
+        """Return absence statistics for each season code 1-4.
+
+        Returns:
+            dict with a ``seasons`` list. Each entry has ``code``, ``name``
+            (from ``config.SEASON_NAMES`` with a generated fallback),
+            ``total_hours``, ``avg_hours``, ``record_count`` and
+            ``percentage`` (share of all counted records, 1 decimal).
+            Seasons without records are omitted.
+        """
+        stats = self._hours_by('Seasons')
+        # The denominator covers every season value present in the data,
+        # not only codes 1-4.
+        total_records = sum(count for _, _, count in stats.values())
+
+        seasons = []
+        for code in [1, 2, 3, 4]:
+            if code not in stats:
+                continue
+            total_hours, avg_hours, record_count = stats[code]
+            seasons.append({
+                'code': code,
+                'name': config.SEASON_NAMES.get(code, f'季节{code}'),
+                'total_hours': total_hours,
+                'avg_hours': avg_hours,
+                'record_count': record_count,
+                'percentage': self._percentage(record_count, total_records)
+            })
+
+        return {'seasons': seasons}
+
+
+# Shared module-level instance. Constructing it does not load any data;
+# the DataFrame is fetched the first time its ``df`` property is read.
+data_service = DataService()