- 新增中国企业员工缺勤模拟数据集生成脚本(generate_dataset.py),覆盖7个行业、180家企业、2600名员工 - 重构 config.py,更新特征字段为中文名称,调整目标列、员工ID、行业类型等配置 - 重构 clustering.py,简化聚类逻辑,更新聚类特征和群体命名(高压通勤型、健康波动型等) - 重构 feature_mining.py,更新相关性分析和群体比较维度(按行业、班次、婚姻状态等) - 新增 model_features.py 定义模型训练特征 - 更新 preprocessing.py 和 train_model.py 适配新数据结构 - 更新各 API 路由默认参数(model: random_forest, dimension: industry) - 前端更新主题样式和各视图组件适配中文字段 - 更新系统名称为 China Enterprise Absence Analysis System
93 lines
2.7 KiB
Python
93 lines
2.7 KiB
Python
import os
|
|
|
|
import joblib
|
|
import pandas as pd
|
|
from sklearn.preprocessing import StandardScaler
|
|
|
|
import config
|
|
from core.generate_dataset import ensure_dataset
|
|
|
|
|
|
class DataPreprocessor:
    """Load, clean, and scale the enterprise-absence dataset.

    Combines simple imputation (median for numeric columns, mode for
    categorical ones) with a StandardScaler, and persists the fitted
    state via joblib so inference can reuse the training-time transform.
    """

    def __init__(self):
        self.scaler = StandardScaler()
        # True once fit_transform() or load_preprocessor() has run.
        self.is_fitted = False
        # Feature-column order seen at fit time; used by transform()
        # to keep the positional scaler aligned.
        self.feature_names = None

    def load_raw_data(self):
        """Read the raw CSV (generating it first if missing) into a DataFrame."""
        ensure_dataset()
        df = pd.read_csv(config.RAW_DATA_PATH, sep=config.CSV_SEPARATOR)
        # Header cells may carry stray whitespace from the generator/export.
        df.columns = df.columns.str.strip()
        return df

    def clean_data(self, df):
        """Return a de-duplicated copy of *df* with missing values imputed.

        Numeric columns are filled with their median, other columns with
        their mode. A non-numeric column that is entirely null is left
        untouched (there is no observed value to impute from).
        """
        df = df.copy()
        df = df.drop_duplicates()

        for col in df.columns:
            if df[col].isnull().sum() == 0:
                continue
            if pd.api.types.is_numeric_dtype(df[col]):
                df[col] = df[col].fillna(df[col].median())
            else:
                mode_values = df[col].mode()
                # mode() returns an empty Series when every value in the
                # column is null; indexing [0] on it raised IndexError
                # before this guard.
                if not mode_values.empty:
                    df[col] = df[col].fillna(mode_values[0])

        return df

    def fit_transform(self, df):
        """Clean *df*, fit the scaler on its features, and return (X, y).

        y is None when the target column is absent (e.g. pure inference
        data). Records the feature-column order for later transform().
        """
        df = self.clean_data(df)
        if config.TARGET_COLUMN in df.columns:
            y = df[config.TARGET_COLUMN].values
            feature_df = df.drop(columns=[config.TARGET_COLUMN])
        else:
            y = None
            feature_df = df

        self.feature_names = list(feature_df.columns)
        X = self.scaler.fit_transform(feature_df.values)
        self.is_fitted = True
        return X, y

    def transform(self, df):
        """Clean *df* and scale it with the already-fitted scaler.

        Raises ValueError if neither fit_transform() nor
        load_preprocessor() has been called first.
        """
        if not self.is_fitted:
            raise ValueError("Preprocessor has not been fitted yet.")

        df = self.clean_data(df)
        if config.TARGET_COLUMN in df.columns:
            feature_df = df.drop(columns=[config.TARGET_COLUMN])
        else:
            feature_df = df
        # Re-order columns to match fit time: StandardScaler is purely
        # positional, so a different column order would silently produce
        # wrong scaling. A KeyError here means a training column is missing.
        if self.feature_names is not None:
            feature_df = feature_df[self.feature_names]
        return self.scaler.transform(feature_df.values)

    def save_preprocessor(self):
        """Persist the fitted scaler and feature-name order under MODELS_DIR."""
        os.makedirs(config.MODELS_DIR, exist_ok=True)
        joblib.dump(self.scaler, config.SCALER_PATH)
        joblib.dump(self.feature_names, os.path.join(config.MODELS_DIR, 'feature_names.pkl'))

    def load_preprocessor(self):
        """Restore the scaler (and feature names, if saved) and mark as fitted."""
        self.scaler = joblib.load(config.SCALER_PATH)
        feature_names_path = os.path.join(config.MODELS_DIR, 'feature_names.pkl')
        if os.path.exists(feature_names_path):
            self.feature_names = joblib.load(feature_names_path)
        self.is_fitted = True
|
|
|
|
|
|
def get_clean_data():
    """Load the raw dataset and return a cleaned (but not persisted) DataFrame."""
    prep = DataPreprocessor()
    return prep.clean_data(prep.load_raw_data())
|
|
|
|
|
|
def save_clean_data():
    """Clean the raw dataset, write it to CLEAN_DATA_PATH, and return it.

    Delegates the load+clean pipeline to get_clean_data() instead of
    duplicating it here.
    """
    df = get_clean_data()
    os.makedirs(config.PROCESSED_DATA_DIR, exist_ok=True)
    # Output is deliberately normalized to comma separation, regardless of
    # the raw file's config.CSV_SEPARATOR.
    df.to_csv(config.CLEAN_DATA_PATH, index=False, sep=',')
    return df
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: clean the raw dataset and persist the result.
    cleaned = save_clean_data()
    print(f"Clean data saved. Shape: {cleaned.shape}")
|