- 新增中国企业员工缺勤模拟数据集生成脚本(generate_dataset.py),覆盖7个行业、180家企业、2600名员工 - 重构 config.py,更新特征字段为中文名称,调整目标列、员工ID、行业类型等配置 - 重构 clustering.py,简化聚类逻辑,更新聚类特征和群体命名(高压通勤型、健康波动型等) - 重构 feature_mining.py,更新相关性分析和群体比较维度(按行业、班次、婚姻状态等) - 新增 model_features.py 定义模型训练特征 - 更新 preprocessing.py 和 train_model.py 适配新数据结构 - 更新各 API 路由默认参数(model: random_forest, dimension: industry) - 前端更新主题样式和各视图组件适配中文字段 - 更新系统名称为 China Enterprise Absence Analysis System
301 lines
11 KiB
Python
301 lines
11 KiB
Python
import os
|
|
import sys
|
|
import time
|
|
from datetime import datetime
|
|
|
|
import joblib
|
|
import numpy as np
|
|
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor
|
|
from sklearn.feature_selection import SelectKBest, f_regression
|
|
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
|
|
from sklearn.model_selection import RandomizedSearchCV, train_test_split
|
|
from sklearn.preprocessing import RobustScaler
|
|
|
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
|
import config
|
|
from core.model_features import (
|
|
NUMERICAL_OUTLIER_COLUMNS,
|
|
ORDINAL_COLUMNS,
|
|
TARGET_COLUMN,
|
|
align_feature_frame,
|
|
apply_label_encoders,
|
|
apply_outlier_bounds,
|
|
engineer_features,
|
|
extract_xy,
|
|
fit_label_encoders,
|
|
fit_outlier_bounds,
|
|
make_target_bins,
|
|
normalize_columns,
|
|
prepare_modeling_dataframe,
|
|
to_float_array,
|
|
)
|
|
from core.preprocessing import get_clean_data
|
|
|
|
try:
|
|
import lightgbm as lgb
|
|
except ImportError:
|
|
lgb = None
|
|
|
|
try:
|
|
import xgboost as xgb
|
|
except ImportError:
|
|
xgb = None
|
|
|
|
|
|
def print_training_log(model_name, start_time, best_score, best_params, n_iter, cv_folds):
|
|
elapsed = time.time() - start_time
|
|
print(f' {"-" * 50}')
|
|
print(f' Model: {model_name}')
|
|
print(f' Time: {elapsed:.1f}s')
|
|
print(f' Best CV R2: {best_score:.4f}')
|
|
for key, value in best_params.items():
|
|
print(f' - {key}: {value}')
|
|
print(f' Iterations: {n_iter}, CV folds: {cv_folds}')
|
|
|
|
|
|
class OptimizedModelTrainer:
|
|
def __init__(self):
|
|
self.models = {}
|
|
self.scaler = RobustScaler()
|
|
self.feature_names = None
|
|
self.selected_features = None
|
|
self.label_encoders = {}
|
|
self.model_metrics = {}
|
|
self.training_metadata = {}
|
|
self.feature_selector = None
|
|
self.outlier_bounds = {}
|
|
self.feature_k = 22
|
|
self.target_transform = 'log1p'
|
|
self.enabled_models = ['random_forest', 'gradient_boosting', 'extra_trees', 'lightgbm', 'xgboost']
|
|
|
|
def analyze_data(self, df):
|
|
y = df[TARGET_COLUMN]
|
|
print('\nData Analysis')
|
|
print(f' Samples: {len(df)}')
|
|
print(f' Mean: {y.mean():.2f}, Median: {y.median():.2f}, Std: {y.std():.2f}')
|
|
print(f' High risk ratio (>8h): {(y > 8).mean() * 100:.1f}%')
|
|
|
|
def select_features(self, X, y, k=20):
|
|
selector = SelectKBest(score_func=f_regression, k=min(k, X.shape[1]))
|
|
selector.fit(X, y)
|
|
self.feature_selector = selector
|
|
mask = selector.get_support()
|
|
self.selected_features = [name for name, keep in zip(self.feature_names, mask) if keep]
|
|
return selector.transform(X)
|
|
|
|
def transform_target(self, y):
|
|
return np.log1p(np.clip(y, a_min=0, a_max=None)) if self.target_transform == 'log1p' else y
|
|
|
|
def inverse_transform_target(self, y_pred):
|
|
return np.expm1(y_pred) if self.target_transform == 'log1p' else y_pred
|
|
|
|
def transform_features(self, X_df):
|
|
X_df = align_feature_frame(X_df, self.feature_names)
|
|
X = self.scaler.transform(to_float_array(X_df))
|
|
return self.feature_selector.transform(X) if self.feature_selector else X
|
|
|
|
def prepare_data(self):
|
|
df = normalize_columns(get_clean_data())
|
|
df = prepare_modeling_dataframe(df)
|
|
self.analyze_data(df)
|
|
|
|
target_bins = make_target_bins(df[TARGET_COLUMN].values)
|
|
train_df, test_df = train_test_split(
|
|
df,
|
|
test_size=config.TEST_SIZE,
|
|
random_state=config.RANDOM_STATE,
|
|
stratify=target_bins,
|
|
)
|
|
train_df = train_df.reset_index(drop=True)
|
|
test_df = test_df.reset_index(drop=True)
|
|
|
|
self.outlier_bounds = fit_outlier_bounds(train_df, NUMERICAL_OUTLIER_COLUMNS)
|
|
train_df = apply_outlier_bounds(train_df, self.outlier_bounds)
|
|
test_df = apply_outlier_bounds(test_df, self.outlier_bounds)
|
|
|
|
train_df = engineer_features(train_df)
|
|
test_df = engineer_features(test_df)
|
|
X_train_df, y_train = extract_xy(train_df)
|
|
X_test_df, y_test = extract_xy(test_df)
|
|
|
|
X_train_df, self.label_encoders = fit_label_encoders(X_train_df, ORDINAL_COLUMNS)
|
|
X_test_df = apply_label_encoders(X_test_df, self.label_encoders)
|
|
|
|
self.feature_names = list(X_train_df.columns)
|
|
X_test_df = align_feature_frame(X_test_df, self.feature_names)
|
|
X_train = self.scaler.fit_transform(to_float_array(X_train_df))
|
|
X_test = self.scaler.transform(to_float_array(X_test_df))
|
|
|
|
transformed_target = self.transform_target(y_train)
|
|
X_train = self.select_features(X_train, transformed_target, k=self.feature_k)
|
|
X_test = self.transform_features(X_test_df)
|
|
|
|
self.training_metadata = {
|
|
'train_samples': int(len(train_df)),
|
|
'test_samples': int(len(test_df)),
|
|
'feature_count_before_selection': int(len(self.feature_names)),
|
|
'feature_count_after_selection': int(X_train.shape[1]),
|
|
'training_date': datetime.now().strftime('%Y-%m-%d'),
|
|
'target_transform': self.target_transform,
|
|
'available_models': list(self.enabled_models),
|
|
}
|
|
return X_train, X_test, y_train, y_test
|
|
|
|
def _run_search(self, name, estimator, params, X_train, y_train, n_iter=12):
|
|
start_time = time.time()
|
|
search = RandomizedSearchCV(
|
|
estimator,
|
|
param_distributions=params,
|
|
n_iter=n_iter,
|
|
cv=4,
|
|
scoring='r2',
|
|
n_jobs=-1,
|
|
random_state=config.RANDOM_STATE,
|
|
)
|
|
search.fit(X_train, y_train)
|
|
self.models[name] = search.best_estimator_
|
|
print_training_log(name, start_time, search.best_score_, search.best_params_, n_iter, 4)
|
|
|
|
def train_random_forest(self, X_train, y_train):
|
|
self._run_search(
|
|
'random_forest',
|
|
RandomForestRegressor(random_state=config.RANDOM_STATE, n_jobs=-1),
|
|
{
|
|
'n_estimators': [200, 300, 400],
|
|
'max_depth': [10, 14, 18, None],
|
|
'min_samples_split': [2, 4, 8],
|
|
'min_samples_leaf': [1, 2, 3],
|
|
'max_features': ['sqrt', 0.7],
|
|
},
|
|
X_train,
|
|
y_train,
|
|
)
|
|
|
|
def train_gradient_boosting(self, X_train, y_train):
|
|
self._run_search(
|
|
'gradient_boosting',
|
|
GradientBoostingRegressor(random_state=config.RANDOM_STATE),
|
|
{
|
|
'n_estimators': [160, 220, 300],
|
|
'max_depth': [3, 4, 5],
|
|
'learning_rate': [0.03, 0.05, 0.08],
|
|
'subsample': [0.7, 0.85, 1.0],
|
|
'min_samples_split': [2, 4, 6],
|
|
'min_samples_leaf': [1, 2, 3],
|
|
},
|
|
X_train,
|
|
y_train,
|
|
)
|
|
|
|
def train_extra_trees(self, X_train, y_train):
|
|
self._run_search(
|
|
'extra_trees',
|
|
ExtraTreesRegressor(random_state=config.RANDOM_STATE, n_jobs=-1),
|
|
{
|
|
'n_estimators': [220, 320, 420],
|
|
'max_depth': [10, 15, 20, None],
|
|
'min_samples_split': [2, 4, 8],
|
|
'min_samples_leaf': [1, 2, 3],
|
|
'max_features': ['sqrt', 0.7],
|
|
},
|
|
X_train,
|
|
y_train,
|
|
)
|
|
|
|
def train_lightgbm(self, X_train, y_train):
|
|
if lgb is None:
|
|
return
|
|
self._run_search(
|
|
'lightgbm',
|
|
lgb.LGBMRegressor(random_state=config.RANDOM_STATE, n_jobs=-1, verbose=-1),
|
|
{
|
|
'n_estimators': [180, 260, 340],
|
|
'max_depth': [7, 9, -1],
|
|
'learning_rate': [0.03, 0.05, 0.08],
|
|
'subsample': [0.7, 0.85, 1.0],
|
|
'colsample_bytree': [0.7, 0.85, 1.0],
|
|
'num_leaves': [31, 50, 70],
|
|
},
|
|
X_train,
|
|
y_train,
|
|
)
|
|
|
|
def train_xgboost(self, X_train, y_train):
|
|
if xgb is None:
|
|
return
|
|
self._run_search(
|
|
'xgboost',
|
|
xgb.XGBRegressor(random_state=config.RANDOM_STATE, n_jobs=-1),
|
|
{
|
|
'n_estimators': [180, 260, 340],
|
|
'max_depth': [4, 6, 8],
|
|
'learning_rate': [0.03, 0.05, 0.08],
|
|
'subsample': [0.7, 0.85, 1.0],
|
|
'colsample_bytree': [0.7, 0.85, 1.0],
|
|
'min_child_weight': [1, 3, 5],
|
|
},
|
|
X_train,
|
|
y_train,
|
|
)
|
|
|
|
def evaluate_model(self, model, X_test, y_test):
|
|
y_pred = self.inverse_transform_target(model.predict(X_test))
|
|
y_pred = np.clip(y_pred, a_min=0, a_max=None)
|
|
mse = mean_squared_error(y_test, y_pred)
|
|
return {
|
|
'r2': round(r2_score(y_test, y_pred), 4),
|
|
'mse': round(mse, 4),
|
|
'rmse': round(np.sqrt(mse), 4),
|
|
'mae': round(mean_absolute_error(y_test, y_pred), 4),
|
|
}
|
|
|
|
def save_models(self):
|
|
os.makedirs(config.MODELS_DIR, exist_ok=True)
|
|
for name, model in self.models.items():
|
|
joblib.dump(model, os.path.join(config.MODELS_DIR, f'{name}_model.pkl'))
|
|
joblib.dump(self.scaler, config.SCALER_PATH)
|
|
joblib.dump(self.feature_names, os.path.join(config.MODELS_DIR, 'feature_names.pkl'))
|
|
joblib.dump(self.selected_features, os.path.join(config.MODELS_DIR, 'selected_features.pkl'))
|
|
joblib.dump(self.label_encoders, os.path.join(config.MODELS_DIR, 'label_encoders.pkl'))
|
|
joblib.dump(self.model_metrics, os.path.join(config.MODELS_DIR, 'model_metrics.pkl'))
|
|
joblib.dump(self.training_metadata, os.path.join(config.MODELS_DIR, 'training_metadata.pkl'))
|
|
|
|
def train_all(self):
|
|
print('\nOptimized Model Training Started')
|
|
X_train, X_test, y_train, y_test = self.prepare_data()
|
|
y_train_transformed = self.transform_target(y_train)
|
|
|
|
if 'random_forest' in self.enabled_models:
|
|
self.train_random_forest(X_train, y_train_transformed)
|
|
if 'gradient_boosting' in self.enabled_models:
|
|
self.train_gradient_boosting(X_train, y_train_transformed)
|
|
if 'extra_trees' in self.enabled_models:
|
|
self.train_extra_trees(X_train, y_train_transformed)
|
|
if 'lightgbm' in self.enabled_models:
|
|
self.train_lightgbm(X_train, y_train_transformed)
|
|
if 'xgboost' in self.enabled_models:
|
|
self.train_xgboost(X_train, y_train_transformed)
|
|
|
|
for name, model in self.models.items():
|
|
metrics = self.evaluate_model(model, X_test, y_test)
|
|
self.model_metrics[name] = metrics
|
|
print(f' {name:20s} R2={metrics["r2"]:.4f} RMSE={metrics["rmse"]:.4f} MAE={metrics["mae"]:.4f}')
|
|
|
|
self.save_models()
|
|
return self.model_metrics
|
|
|
|
|
|
def train_and_save_models():
|
|
start = time.time()
|
|
trainer = OptimizedModelTrainer()
|
|
metrics = trainer.train_all()
|
|
print(f'\nTraining Complete in {time.time() - start:.1f}s')
|
|
for idx, (name, metric) in enumerate(sorted(metrics.items(), key=lambda item: item[1]['r2'], reverse=True), start=1):
|
|
print(f'{idx}. {name} - R2={metric["r2"]:.4f}')
|
|
return metrics
|
|
|
|
|
|
if __name__ == '__main__':
|
|
train_and_save_models()
|