feat: 初始化员工缺勤分析系统项目

搭建完整的前后端分离架构，实现数据概览、预测分析、聚类分析等核心功能模块。

详细版：feat: 初始化员工缺勤分析系统项目

- 后端：基于 Flask 搭建 RESTful API，包含数据概览、特征分析、预测模型、聚类分析四大模块
- 前端：基于 Vue.js 构建单页应用，实现 Dashboard、预测、聚类、因子分析等页面
- 模型：集成随机森林、XGBoost、LightGBM、Stacking 等多种机器学习模型
- 文档：完成需求规格说明、系统架构设计、接口设计、数据设计、UI 原型设计等文档
2026-03-08 14:48:26 +08:00
commit a39d8b2fd2
48 changed files with 9546 additions and 0 deletions
--- a/backend/core/preprocessing.py
+++ b/backend/core/preprocessing.py
@@ -0,0 +1,105 @@
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import StandardScaler
+import joblib
+import os
+
+import config
+
+
+class DataPreprocessor:
+    """Clean a tabular dataset and standardize its feature columns.
+
+    Bundles a scikit-learn ``StandardScaler`` with the list of feature column
+    names captured at fit time, and can persist/restore both via joblib.
+
+    Instance fields:
+        scaler: the ``StandardScaler`` used by ``fit_transform``/``transform``.
+        is_fitted: ``True`` once ``fit_transform`` or ``load_preprocessor``
+            has run.
+        feature_names: feature column names recorded by ``fit_transform``
+            (``None`` until then, or if no saved list was found on load).
+    """
+
+    # Target column; it is separated from the features before scaling.
+    _TARGET_COLUMN = 'Absenteeism time in hours'
+
+    def __init__(self):
+        self.scaler = StandardScaler()
+        self.is_fitted = False
+        self.feature_names = None
+
+    def load_raw_data(self):
+        """Read the raw CSV at ``config.RAW_DATA_PATH``.
+
+        The file is parsed with ``config.CSV_SEPARATOR`` and surrounding
+        whitespace is stripped from the column names.
+
+        Returns:
+            The raw data as a ``pandas.DataFrame``.
+        """
+        df = pd.read_csv(config.RAW_DATA_PATH, sep=config.CSV_SEPARATOR)
+        df.columns = df.columns.str.strip()
+        return df
+
+    def clean_data(self, df):
+        """Return a de-duplicated copy of *df* with missing values imputed.
+
+        Numeric columns are filled with the column median; every other column
+        is filled with its most frequent value. Columns that contain no
+        usable value at all are left untouched. The input frame is not
+        modified.
+
+        Args:
+            df: the ``pandas.DataFrame`` to clean.
+
+        Returns:
+            A new, cleaned ``pandas.DataFrame``.
+        """
+        # Copy after de-duplicating so the column assignments below operate
+        # on an independent frame and never touch the caller's data.
+        df = df.drop_duplicates().copy()
+
+        for col in df.columns:
+            column = df[col]
+            if not column.isnull().any():
+                continue
+
+            # Check the dtype family rather than the exact names 'int64' /
+            # 'float64', so float32 and nullable numeric columns also get the
+            # median instead of falling through to the mode.
+            is_numeric = (
+                pd.api.types.is_numeric_dtype(column)
+                and not pd.api.types.is_bool_dtype(column)
+            )
+            if is_numeric:
+                fill_value = column.median()
+                if pd.isna(fill_value):
+                    # Entirely missing column: there is no median to use.
+                    continue
+            else:
+                modes = column.mode()
+                if modes.empty:
+                    # Entirely missing column: there is no mode to use.
+                    continue
+                fill_value = modes.iloc[0]
+
+            # Assign the result back instead of calling
+            # ``df[col].fillna(..., inplace=True)``: the in-place form acts on
+            # a temporary selection, is deprecated in pandas 2.x, and has no
+            # effect on ``df`` when Copy-on-Write is enabled.
+            df[col] = column.fillna(fill_value)
+
+        return df
+
+    def _drop_target(self, df):
+        """Return *df* without the target column (unchanged if it is absent)."""
+        if self._TARGET_COLUMN in df.columns:
+            return df.drop(columns=[self._TARGET_COLUMN])
+        return df
+
+    def fit_transform(self, df):
+        """Clean *df*, fit the scaler on its features and return scaled data.
+
+        Records the feature column names in ``self.feature_names`` and marks
+        the preprocessor as fitted.
+
+        Args:
+            df: the ``pandas.DataFrame`` to fit on.
+
+        Returns:
+            A tuple ``(X, y)``: ``X`` is the scaled feature array and ``y`` is
+            the array of target values, or ``None`` when *df* has no target
+            column.
+        """
+        df = self.clean_data(df)
+
+        if self._TARGET_COLUMN in df.columns:
+            y = df[self._TARGET_COLUMN].values
+        else:
+            y = None
+        feature_df = self._drop_target(df)
+
+        self.feature_names = list(feature_df.columns)
+        X = self.scaler.fit_transform(feature_df.values)
+        self.is_fitted = True
+
+        return X, y
+
+    def transform(self, df):
+        """Clean *df* and scale its features with the already-fitted scaler.
+
+        When the fit-time column list is known, the features are selected in
+        exactly that order so each column is scaled with its own statistics
+        regardless of the column order of *df*.
+
+        NOTE(review): ``clean_data`` drops duplicate rows, so the result can
+        have fewer rows than *df*.
+
+        Args:
+            df: the ``pandas.DataFrame`` to transform.
+
+        Returns:
+            The scaled feature array.
+
+        Raises:
+            ValueError: if the preprocessor is not fitted, or if *df* lacks
+                one of the recorded feature columns.
+        """
+        if not self.is_fitted:
+            raise ValueError("Preprocessor has not been fitted yet.")
+
+        df = self.clean_data(df)
+        feature_df = self._drop_target(df)
+
+        if self.feature_names is not None:
+            missing = [
+                name for name in self.feature_names
+                if name not in feature_df.columns
+            ]
+            if missing:
+                raise ValueError(
+                    f"Input is missing feature columns: {missing}"
+                )
+            # The scaler works positionally, so restore the fit-time order.
+            feature_df = feature_df[self.feature_names]
+
+        return self.scaler.transform(feature_df.values)
+
+    def save_preprocessor(self):
+        """Persist the scaler and the feature-name list with joblib.
+
+        Creates ``config.MODELS_DIR`` if needed, writes the scaler to
+        ``config.SCALER_PATH`` and the feature names to ``feature_names.pkl``
+        inside ``config.MODELS_DIR``.
+        """
+        os.makedirs(config.MODELS_DIR, exist_ok=True)
+        joblib.dump(self.scaler, config.SCALER_PATH)
+        joblib.dump(self.feature_names, os.path.join(config.MODELS_DIR, 'feature_names.pkl'))
+
+    def load_preprocessor(self):
+        """Restore the scaler (and feature names, if saved) from disk.
+
+        Marks the preprocessor as fitted. ``feature_names`` is left as-is when
+        ``feature_names.pkl`` does not exist.
+        """
+        # joblib files are pickle-based: only load files written by this
+        # application, never files from an untrusted source.
+        self.scaler = joblib.load(config.SCALER_PATH)
+        feature_names_path = os.path.join(config.MODELS_DIR, 'feature_names.pkl')
+        if os.path.exists(feature_names_path):
+            self.feature_names = joblib.load(feature_names_path)
+        self.is_fitted = True
+
+
+def get_clean_data():
+    """Load the raw dataset and return it after cleaning.
+
+    Returns:
+        The cleaned data as a ``pandas.DataFrame``; nothing is written to
+        disk.
+    """
+    pipeline = DataPreprocessor()
+    raw_frame = pipeline.load_raw_data()
+    return pipeline.clean_data(raw_frame)
+
+
+def save_clean_data():
+    """Clean the raw dataset, write it to ``config.CLEAN_DATA_PATH``, return it.
+
+    ``config.PROCESSED_DATA_DIR`` is created if it does not exist. The CSV is
+    written comma-separated and without the index column.
+
+    Returns:
+        The cleaned data as a ``pandas.DataFrame``.
+    """
+    # Same load-and-clean steps as get_clean_data(), so reuse it.
+    cleaned = get_clean_data()
+
+    os.makedirs(config.PROCESSED_DATA_DIR, exist_ok=True)
+    cleaned.to_csv(config.CLEAN_DATA_PATH, index=False, sep=',')
+
+    return cleaned
+
+
+if __name__ == '__main__':
+    # Script entry point: regenerate the cleaned CSV and print a short preview.
+    cleaned = save_clean_data()
+    print(f"Clean data saved. Shape: {cleaned.shape}")
+    preview = cleaned.head()
+    print(preview)