feat: 初始化员工缺勤分析系统项目
搭建完整的前后端分离架构,实现数据概览、预测分析、聚类分析等核心功能模块

详细版:
feat: 初始化员工缺勤分析系统项目
- 后端:基于 Flask 搭建 RESTful API,包含数据概览、特征分析、预测模型、聚类分析四大模块
- 前端:基于 Vue.js 构建单页应用,实现 Dashboard、预测、聚类、因子分析等页面
- 模型:集成随机森林、XGBoost、LightGBM、Stacking 等多种机器学习模型
- 文档:完成需求规格说明、系统架构设计、接口设计、数据设计、UI原型设计等文档
This commit is contained in:
105
backend/core/preprocessing.py
Normal file
105
backend/core/preprocessing.py
Normal file
@@ -0,0 +1,105 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
import joblib
|
||||
import os
|
||||
|
||||
import config
|
||||
|
||||
|
||||
class DataPreprocessor:
    """Clean a DataFrame, standard-scale its features and persist the scaler.

    The label column (``TARGET_COLUMN``) is split off from the features when
    it is present. The feature column order seen by ``fit_transform`` is kept
    in ``feature_names`` and saved next to the scaler so that ``transform``
    can present columns to the scaler in the same order.
    """

    # Name of the label column separated from the feature matrix.
    TARGET_COLUMN = 'Absenteeism time in hours'
    # File name, inside config.MODELS_DIR, that stores the fitted column order.
    FEATURE_NAMES_FILE = 'feature_names.pkl'

    def __init__(self):
        self.scaler = StandardScaler()
        # Set to True by fit_transform() and load_preprocessor().
        self.is_fitted = False
        # Feature column names in the order they were fitted (None until known).
        self.feature_names = None

    @classmethod
    def _feature_names_path(cls):
        """Return the path of the persisted feature-name list."""
        return os.path.join(config.MODELS_DIR, cls.FEATURE_NAMES_FILE)

    def load_raw_data(self):
        """Read the raw CSV named in ``config`` and strip column-name whitespace.

        Returns:
            The raw data as a DataFrame.
        """
        df = pd.read_csv(config.RAW_DATA_PATH, sep=config.CSV_SEPARATOR)
        df.columns = df.columns.str.strip()
        return df

    def clean_data(self, df):
        """Return a de-duplicated copy of *df* with missing values imputed.

        NumPy integer/float columns are filled with their median; every other
        column is filled with its most frequent value. Columns that have no
        observed value at all are left untouched. The input frame is never
        modified.

        Args:
            df: The DataFrame to clean.

        Returns:
            A new, cleaned DataFrame.
        """
        # Explicit copy so the column assignments below never touch the input.
        df = df.drop_duplicates().copy()

        for col in df.columns:
            column = df[col]
            if not column.isnull().any():
                continue

            # Any NumPy int/uint/float width counts as numeric, not only the
            # 64-bit ones. Extension dtypes keep using the mode so a fractional
            # median is never forced into a nullable integer column.
            is_numpy_number = (
                isinstance(column.dtype, np.dtype)
                and column.dtype.kind in 'iuf'
            )
            if is_numpy_number:
                fill_value = column.median()
                if pd.isna(fill_value):
                    # All values are missing: nothing sensible to impute.
                    continue
            else:
                modes = column.mode()
                if modes.empty:
                    # All values are missing: nothing sensible to impute.
                    continue
                fill_value = modes.iloc[0]

            # Assign back rather than fillna(inplace=True) on df[col]; the
            # chained in-place form does not update df under pandas
            # Copy-on-Write and emits a FutureWarning on pandas 2.x.
            df[col] = column.fillna(fill_value)

        return df

    def fit_transform(self, df):
        """Clean *df*, fit the scaler on its features and return scaled data.

        Args:
            df: DataFrame that may contain ``TARGET_COLUMN``.

        Returns:
            A tuple ``(X, y)``: ``X`` is the scaled feature matrix and ``y``
            the target values, or ``None`` when the target column is absent.
        """
        df = self.clean_data(df)

        if self.TARGET_COLUMN in df.columns:
            y = df[self.TARGET_COLUMN].values
            feature_df = df.drop(columns=[self.TARGET_COLUMN])
        else:
            y = None
            feature_df = df

        self.feature_names = list(feature_df.columns)
        X = self.scaler.fit_transform(feature_df.values)
        self.is_fitted = True

        return X, y

    def transform(self, df):
        """Clean *df* and scale its features with the already-fitted scaler.

        Args:
            df: DataFrame with the feature columns; ``TARGET_COLUMN`` is
                dropped if present.

        Returns:
            The scaled feature matrix.

        Raises:
            ValueError: If the preprocessor has not been fitted or loaded.
        """
        if not self.is_fitted:
            raise ValueError("Preprocessor has not been fitted yet.")

        # NOTE(review): clean_data() drops duplicate rows, so the result can
        # have fewer rows than df - confirm callers do not rely on row
        # alignment with their input.
        feature_df = self.clean_data(df)
        if self.TARGET_COLUMN in feature_df.columns:
            feature_df = feature_df.drop(columns=[self.TARGET_COLUMN])

        # The scaler works positionally. When the input has exactly the fitted
        # columns but in another order, restore the fitted order so each
        # column is scaled with its own statistics. Any other input is passed
        # through positionally, as before.
        if (
            self.feature_names is not None
            and len(feature_df.columns) == len(self.feature_names)
            and set(feature_df.columns) == set(self.feature_names)
        ):
            feature_df = feature_df[self.feature_names]

        return self.scaler.transform(feature_df.values)

    def save_preprocessor(self):
        """Persist the scaler and the feature-name list under ``config.MODELS_DIR``."""
        os.makedirs(config.MODELS_DIR, exist_ok=True)
        joblib.dump(self.scaler, config.SCALER_PATH)
        joblib.dump(self.feature_names, self._feature_names_path())

    def load_preprocessor(self):
        """Load the persisted scaler (and feature names, if saved) and mark as fitted.

        joblib files are pickle-based, so only load artifacts from a trusted
        location.
        """
        self.scaler = joblib.load(config.SCALER_PATH)
        feature_names_path = self._feature_names_path()
        # The feature-name file is optional; keep the current value if absent.
        if os.path.exists(feature_names_path):
            self.feature_names = joblib.load(feature_names_path)
        self.is_fitted = True
|
||||
|
||||
|
||||
def get_clean_data():
    """Load the raw dataset and return it after cleaning.

    Returns:
        The DataFrame produced by ``DataPreprocessor.clean_data`` applied to
        the result of ``DataPreprocessor.load_raw_data``.
    """
    pipeline = DataPreprocessor()
    return pipeline.clean_data(pipeline.load_raw_data())
|
||||
|
||||
|
||||
def save_clean_data():
    """Clean the raw dataset, write it to ``config.CLEAN_DATA_PATH`` and return it.

    The output directory ``config.PROCESSED_DATA_DIR`` is created when it
    does not exist. The file is written comma-separated without the index.

    Returns:
        The cleaned DataFrame that was written to disk.
    """
    # Same load-and-clean steps as get_clean_data(), so reuse it.
    cleaned = get_clean_data()

    os.makedirs(config.PROCESSED_DATA_DIR, exist_ok=True)
    cleaned.to_csv(config.CLEAN_DATA_PATH, sep=',', index=False)

    return cleaned
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: regenerate the cleaned CSV and show a short preview.
    cleaned = save_clean_data()
    print(f"Clean data saved. Shape: {cleaned.shape}")
    print(cleaned.head())
|
||||
Reference in New Issue
Block a user