Files
forsetsystem/backend/core/train_model.py
shuo e8235bf3ca feat: add JD-R theory analysis module and SHAP explainability features
- Backend: new JD-R (Job Demands-Resources) theory dimension data generation, adding
  16 new feature columns across job demands, job resources, personal resources, and
  mediating variables
- New JD-R analysis service and API (dimension statistics, burnout/engagement
  analysis, dual-path mediation analysis, group profiles, risk distribution)
- New SHAP explainability analysis module (global importance, local explanations,
  feature interactions, dependence plots); a rough sketch follows this header
- Prediction service gains risk classification model loading and probability
  prediction
- Frontend: new JD-R analysis page (JDRAnalysis.vue) with radar chart, scatter plot,
  path analysis, and other visualizations
- Prediction page now shows risk probabilities and SHAP feature explanations
- Routes and navigation menu updated accordingly
2026-04-04 07:15:46 +08:00
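
The SHAP module referenced in the commit lives elsewhere in the backend; the file below only trains and saves the models it explains. As a minimal sketch of the global-importance piece, assuming the shap package is installed and applied to one of the tree models saved by this script (the global_importance helper and its output format are illustrative, not the project's actual API):

import numpy as np
import shap

def global_importance(model, X, feature_names):
    # Mean absolute SHAP value per feature is the standard global-importance summary.
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X)  # shape (n_samples, n_features) for a regressor
    ranking = sorted(
        zip(feature_names, np.abs(shap_values).mean(axis=0)),
        key=lambda item: item[1],
        reverse=True,
    )
    return [(name, float(value)) for name, value in ranking]

Local explanations and dependence plots would reuse the same explainer object on individual rows.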

import os
import sys
import time
import inspect
from datetime import datetime

import joblib
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import RobustScaler

# Make the backend package importable when this script is run directly.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import config
from core.deep_learning_model import is_available as deep_learning_available
from core.deep_learning_model import train_lstm_mlp
from core.model_features import (
    NUMERICAL_OUTLIER_COLUMNS,
    ORDINAL_COLUMNS,
    TARGET_COLUMN,
    align_feature_frame,
    apply_label_encoders,
    apply_outlier_bounds,
    engineer_features,
    extract_xy,
    fit_label_encoders,
    fit_outlier_bounds,
    make_target_bins,
    normalize_columns,
    prepare_modeling_dataframe,
    to_float_array,
)
from core.preprocessing import get_clean_data

# Optional gradient-boosting backends: training skips them gracefully if missing.
try:
    import lightgbm as lgb
except ImportError:
    lgb = None

try:
    import xgboost as xgb
except ImportError:
    xgb = None


def patch_lightgbm_sklearn_compatibility():
    # Newer scikit-learn renamed check_X_y's force_all_finite argument to
    # ensure_all_finite; older LightGBM still passes the old name, so bridge it.
    if lgb is None:
        return
    try:
        from sklearn.utils.validation import check_X_y
    except Exception:
        return
    params = inspect.signature(check_X_y).parameters
    if 'force_all_finite' in params:
        return  # this scikit-learn version still accepts the old argument

    def wrapped_check_X_y(*args, force_all_finite=None, **kwargs):
        if (
            force_all_finite is not None
            and 'ensure_all_finite' in params
            and 'ensure_all_finite' not in kwargs
        ):
            kwargs['ensure_all_finite'] = force_all_finite
        return check_X_y(*args, **kwargs)

    try:
        import lightgbm.compat as lgb_compat
        import lightgbm.sklearn as lgb_sklearn
        lgb_compat._LGBMCheckXY = wrapped_check_X_y
        lgb_sklearn._LGBMCheckXY = wrapped_check_X_y
    except Exception:
        pass


patch_lightgbm_sklearn_compatibility()


def print_training_log(model_name, start_time, best_score, best_params, n_iter, cv_folds):
    elapsed = time.time() - start_time
    print(f' {"-" * 50}')
    print(f' Model: {model_name}')
    print(f' Time: {elapsed:.1f}s')
    print(f' Best CV R2: {best_score:.4f}')
    for key, value in best_params.items():
        print(f' - {key}: {value}')
    print(f' Iterations: {n_iter}, CV folds: {cv_folds}')


class OptimizedModelTrainer:
    def __init__(self):
        self.models = {}
        self.scaler = RobustScaler()
        self.feature_names = None
        self.selected_features = None
        self.label_encoders = {}
        self.model_metrics = {}
        self.training_metadata = {}
        self.feature_selector = None
        self.outlier_bounds = {}
        self.feature_k = 22
        self.target_transform = 'log1p'
        self.enabled_models = ['random_forest', 'gradient_boosting', 'extra_trees', 'lightgbm', 'xgboost']
        if deep_learning_available():
            self.enabled_models.append('lstm_mlp')
        self.raw_train_df = None
        self.raw_test_df = None

    def analyze_data(self, df):
        y = df[TARGET_COLUMN]
        print('\nData Analysis')
        print(f' Samples: {len(df)}')
        print(f' Mean: {y.mean():.2f}, Median: {y.median():.2f}, Std: {y.std():.2f}')
        print(f' High risk ratio (>8h): {(y > 8).mean() * 100:.1f}%')

    def select_features(self, X, y, k=20):
        selector = SelectKBest(score_func=f_regression, k=min(k, X.shape[1]))
        selector.fit(X, y)
        self.feature_selector = selector
        mask = selector.get_support()
        self.selected_features = [name for name, keep in zip(self.feature_names, mask) if keep]
        return selector.transform(X)

    def transform_target(self, y):
        return np.log1p(np.clip(y, a_min=0, a_max=None)) if self.target_transform == 'log1p' else y

    def inverse_transform_target(self, y_pred):
        return np.expm1(y_pred) if self.target_transform == 'log1p' else y_pred

    def transform_features(self, X_df):
        X_df = align_feature_frame(X_df, self.feature_names)
        X = self.scaler.transform(to_float_array(X_df))
        return self.feature_selector.transform(X) if self.feature_selector is not None else X

    def prepare_data(self):
        raw_df = normalize_columns(get_clean_data())
        self.analyze_data(prepare_modeling_dataframe(raw_df.copy()))
        # Stratify the split on binned target values so both splits cover the full range.
        target_bins = make_target_bins(raw_df[TARGET_COLUMN].values)
        raw_train_df, raw_test_df = train_test_split(
            raw_df,
            test_size=config.TEST_SIZE,
            random_state=config.RANDOM_STATE,
            stratify=target_bins,
        )
        self.raw_train_df = raw_train_df.reset_index(drop=True)
        self.raw_test_df = raw_test_df.reset_index(drop=True)
        train_df = prepare_modeling_dataframe(self.raw_train_df)
        test_df = prepare_modeling_dataframe(self.raw_test_df)
        # Fit outlier bounds and label encoders on the training split only, then
        # apply them to the test split, so no test information leaks into training.
        self.outlier_bounds = fit_outlier_bounds(train_df, NUMERICAL_OUTLIER_COLUMNS)
        train_df = apply_outlier_bounds(train_df, self.outlier_bounds)
        test_df = apply_outlier_bounds(test_df, self.outlier_bounds)
        train_df = engineer_features(train_df)
        test_df = engineer_features(test_df)
        X_train_df, y_train = extract_xy(train_df)
        X_test_df, y_test = extract_xy(test_df)
        X_train_df, self.label_encoders = fit_label_encoders(X_train_df, ORDINAL_COLUMNS)
        X_test_df = apply_label_encoders(X_test_df, self.label_encoders)
        self.feature_names = list(X_train_df.columns)
        X_test_df = align_feature_frame(X_test_df, self.feature_names)
        X_train = self.scaler.fit_transform(to_float_array(X_train_df))
        transformed_target = self.transform_target(y_train)
        X_train = self.select_features(X_train, transformed_target, k=self.feature_k)
        # transform_features scales and selects in one step for the test split.
        X_test = self.transform_features(X_test_df)
        self.training_metadata = {
            'train_samples': int(len(train_df)),
            'test_samples': int(len(test_df)),
            'feature_count_before_selection': int(len(self.feature_names)),
            'feature_count_after_selection': int(X_train.shape[1]),
            'training_date': datetime.now().strftime('%Y-%m-%d'),
            'target_transform': self.target_transform,
            'available_models': [],
            'deep_learning_available': False,
        }
        return X_train, X_test, y_train, y_test

    def _run_search(self, name, estimator, params, X_train, y_train, n_iter=12):
        start_time = time.time()
        search = RandomizedSearchCV(
            estimator,
            param_distributions=params,
            n_iter=n_iter,
            cv=4,
            scoring='r2',
            n_jobs=-1,
            random_state=config.RANDOM_STATE,
        )
        search.fit(X_train, y_train)
        self.models[name] = search.best_estimator_
        print_training_log(name, start_time, search.best_score_, search.best_params_, n_iter, 4)

    def train_random_forest(self, X_train, y_train):
        self._run_search(
            'random_forest',
            RandomForestRegressor(random_state=config.RANDOM_STATE, n_jobs=-1),
            {
                'n_estimators': [200, 300, 400],
                'max_depth': [10, 14, 18, None],
                'min_samples_split': [2, 4, 8],
                'min_samples_leaf': [1, 2, 3],
                'max_features': ['sqrt', 0.7],
            },
            X_train,
            y_train,
        )

    def train_gradient_boosting(self, X_train, y_train):
        self._run_search(
            'gradient_boosting',
            GradientBoostingRegressor(random_state=config.RANDOM_STATE),
            {
                'n_estimators': [160, 220, 300],
                'max_depth': [3, 4, 5],
                'learning_rate': [0.03, 0.05, 0.08],
                'subsample': [0.7, 0.85, 1.0],
                'min_samples_split': [2, 4, 6],
                'min_samples_leaf': [1, 2, 3],
            },
            X_train,
            y_train,
        )

    def train_extra_trees(self, X_train, y_train):
        self._run_search(
            'extra_trees',
            ExtraTreesRegressor(random_state=config.RANDOM_STATE, n_jobs=-1),
            {
                'n_estimators': [220, 320, 420],
                'max_depth': [10, 15, 20, None],
                'min_samples_split': [2, 4, 8],
                'min_samples_leaf': [1, 2, 3],
                'max_features': ['sqrt', 0.7],
            },
            X_train,
            y_train,
        )

    def train_lightgbm(self, X_train, y_train):
        if lgb is None:
            return
        try:
            self._run_search(
                'lightgbm',
                lgb.LGBMRegressor(random_state=config.RANDOM_STATE, n_jobs=-1, verbose=-1),
                {
                    'n_estimators': [180, 260, 340],
                    'max_depth': [7, 9, -1],
                    'learning_rate': [0.03, 0.05, 0.08],
                    'subsample': [0.7, 0.85, 1.0],
                    'colsample_bytree': [0.7, 0.85, 1.0],
                    'num_leaves': [31, 50, 70],
                },
                X_train,
                y_train,
            )
        except Exception as exc:
            print(f' {"-" * 50}')
            print(' Model: lightgbm')
            print(f' Skipped: {exc}')

    def train_xgboost(self, X_train, y_train):
        if xgb is None:
            return
        self._run_search(
            'xgboost',
            xgb.XGBRegressor(random_state=config.RANDOM_STATE, n_jobs=-1),
            {
                'n_estimators': [180, 260, 340],
                'max_depth': [4, 6, 8],
                'learning_rate': [0.03, 0.05, 0.08],
                'subsample': [0.7, 0.85, 1.0],
                'colsample_bytree': [0.7, 0.85, 1.0],
                'min_child_weight': [1, 3, 5],
            },
            X_train,
            y_train,
        )

    def evaluate_model(self, model, X_test, y_test):
        # Predictions are made in the transformed target space, then mapped back
        # to hours and clipped to non-negative values before scoring.
        y_pred = self.inverse_transform_target(model.predict(X_test))
        y_pred = np.clip(y_pred, a_min=0, a_max=None)
        mse = mean_squared_error(y_test, y_pred)
        return {
            'r2': round(r2_score(y_test, y_pred), 4),
            'mse': round(mse, 4),
            'rmse': round(np.sqrt(mse), 4),
            'mae': round(mean_absolute_error(y_test, y_pred), 4),
        }

    def save_models(self):
        os.makedirs(config.MODELS_DIR, exist_ok=True)
        for name, model in self.models.items():
            joblib.dump(model, os.path.join(config.MODELS_DIR, f'{name}_model.pkl'))
        self.training_metadata['available_models'] = list(self.model_metrics.keys())
        joblib.dump(self.scaler, config.SCALER_PATH)
        joblib.dump(self.feature_names, os.path.join(config.MODELS_DIR, 'feature_names.pkl'))
        joblib.dump(self.selected_features, os.path.join(config.MODELS_DIR, 'selected_features.pkl'))
        joblib.dump(self.label_encoders, os.path.join(config.MODELS_DIR, 'label_encoders.pkl'))
        joblib.dump(self.model_metrics, os.path.join(config.MODELS_DIR, 'model_metrics.pkl'))
        joblib.dump(self.training_metadata, os.path.join(config.MODELS_DIR, 'training_metadata.pkl'))

    def train_all(self):
        print('\nOptimized Model Training Started')
        X_train, X_test, y_train, y_test = self.prepare_data()
        y_train_transformed = self.transform_target(y_train)
        if 'random_forest' in self.enabled_models:
            self.train_random_forest(X_train, y_train_transformed)
        if 'gradient_boosting' in self.enabled_models:
            self.train_gradient_boosting(X_train, y_train_transformed)
        if 'extra_trees' in self.enabled_models:
            self.train_extra_trees(X_train, y_train_transformed)
        if 'lightgbm' in self.enabled_models:
            self.train_lightgbm(X_train, y_train_transformed)
        if 'xgboost' in self.enabled_models:
            self.train_xgboost(X_train, y_train_transformed)
        for name, model in self.models.items():
            metrics = self.evaluate_model(model, X_test, y_test)
            self.model_metrics[name] = metrics
            print(f' {name:20s} R2={metrics["r2"]:.4f} RMSE={metrics["rmse"]:.4f} MAE={metrics["mae"]:.4f}')
        if 'lstm_mlp' in self.enabled_models and self.raw_train_df is not None and self.raw_test_df is not None:
            deep_model_path = os.path.join(config.MODELS_DIR, 'lstm_mlp_model.pt')
            deep_result = train_lstm_mlp(
                self.raw_train_df,
                self.raw_test_df,
                deep_model_path,
                target_transform=self.target_transform,
            )
            if deep_result:
                self.model_metrics['lstm_mlp'] = deep_result['metrics']
                self.training_metadata['deep_learning_available'] = True
                self.training_metadata.update(deep_result['metadata'])
                print(
                    f' {"lstm_mlp":20s} R2={deep_result["metrics"]["r2"]:.4f} '
                    f'RMSE={deep_result["metrics"]["rmse"]:.4f} MAE={deep_result["metrics"]["mae"]:.4f}'
                )
        self.save_models()
        # Train the risk classification models on the same feature matrices.
        print('\nRisk Classification Training')
        risk_trainer = RiskClassifierTrainer(self)
        risk_trainer.train_all(X_train, y_train, X_test, y_test)
        risk_trainer.save()
        return self.model_metrics


class RiskClassifierTrainer:
    """Risk-level classifier trainer: low (<4h) / medium (4-8h) / high (>8h)."""

    RISK_MAP = {'low': 0, 'medium': 1, 'high': 2}
    RISK_LABELS = ['low', 'medium', 'high']

    def __init__(self, regression_trainer):
        self.regression_trainer = regression_trainer
        self.classifiers = {}
        self.classification_metrics = {}

    def _make_target(self, y_hours):
        # Bucket raw hours into the three risk classes defined above.
        y_class = np.full(len(y_hours), 1, dtype=int)
        y_class[y_hours < 4] = 0
        y_class[y_hours > 8] = 2
        return y_class

    def train_all(self, X_train, y_train_hours, X_test, y_test_hours):
        y_train_cls = self._make_target(y_train_hours)
        y_test_cls = self._make_target(y_test_hours)
        classifier_configs = {
            'random_forest': RandomForestClassifier(
                n_estimators=300, max_depth=14, random_state=config.RANDOM_STATE, n_jobs=-1,
            ),
            'gradient_boosting': GradientBoostingClassifier(
                n_estimators=200, max_depth=4, learning_rate=0.05, random_state=config.RANDOM_STATE,
            ),
        }
        if lgb is not None:
            classifier_configs['lightgbm'] = lgb.LGBMClassifier(
                n_estimators=260, max_depth=7, learning_rate=0.05,
                random_state=config.RANDOM_STATE, n_jobs=-1, verbose=-1,
            )
        if xgb is not None:
            classifier_configs['xgboost'] = xgb.XGBClassifier(
                n_estimators=260, max_depth=6, learning_rate=0.05,
                random_state=config.RANDOM_STATE, n_jobs=-1,
            )
        for name, clf in classifier_configs.items():
            try:
                clf.fit(X_train, y_train_cls)
                y_pred = clf.predict(X_test)
                self.classifiers[name] = clf
                self.classification_metrics[name] = {
                    'accuracy': round(accuracy_score(y_test_cls, y_pred), 4),
                    'precision_macro': round(precision_score(y_test_cls, y_pred, average='macro', zero_division=0), 4),
                    'recall_macro': round(recall_score(y_test_cls, y_pred, average='macro', zero_division=0), 4),
                    'f1_macro': round(f1_score(y_test_cls, y_pred, average='macro', zero_division=0), 4),
                    'confusion_matrix': confusion_matrix(y_test_cls, y_pred).tolist(),
                }
                m = self.classification_metrics[name]
                print(f' {name:20s} Acc={m["accuracy"]:.4f} F1={m["f1_macro"]:.4f}')
            except Exception as exc:
                print(f' {name:20s} Skipped: {exc}')

    def save(self):
        for name, clf in self.classifiers.items():
            path = os.path.join(config.MODELS_DIR, f'risk_{name}_classifier.pkl')
            joblib.dump(clf, path)
        joblib.dump(self.classification_metrics, os.path.join(config.MODELS_DIR, 'classification_metrics.pkl'))


def train_and_save_models():
    start = time.time()
    trainer = OptimizedModelTrainer()
    metrics = trainer.train_all()
    print(f'\nTraining Complete in {time.time() - start:.1f}s')
    # Leaderboard sorted by held-out R2, best first.
    for idx, (name, metric) in enumerate(sorted(metrics.items(), key=lambda item: item[1]['r2'], reverse=True), start=1):
        print(f'{idx}. {name} - R2={metric["r2"]:.4f}')
    return metrics


if __name__ == '__main__':
    train_and_save_models()
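
For reference, a minimal sketch of the consuming side, assuming the artifact layout that save_models and RiskClassifierTrainer.save write above. The load_regressor_prediction and predict_risk_proba helpers are illustrative, not the project's actual prediction service; note that the SelectKBest object itself is not persisted, so the sketch reselects columns by name, and the input frame is assumed to have already passed through the same prepare_modeling_dataframe / apply_outlier_bounds / engineer_features / apply_label_encoders steps as training.

def load_regressor_prediction(model_name, X_df):
    # Illustrative: mirrors OptimizedModelTrainer.transform_features using the
    # persisted scaler, feature-name list, and selected-feature name list.
    scaler = joblib.load(config.SCALER_PATH)
    feature_names = joblib.load(os.path.join(config.MODELS_DIR, 'feature_names.pkl'))
    selected = joblib.load(os.path.join(config.MODELS_DIR, 'selected_features.pkl'))
    model = joblib.load(os.path.join(config.MODELS_DIR, f'{model_name}_model.pkl'))
    X = scaler.transform(to_float_array(align_feature_frame(X_df, feature_names)))
    X = X[:, [feature_names.index(name) for name in selected]]
    # Undo the log1p target transform applied during training; clip to non-negative hours.
    return np.clip(np.expm1(model.predict(X)), a_min=0, a_max=None)


def predict_risk_proba(name, X_selected):
    # Illustrative: predict_proba columns follow clf.classes_ == [0, 1, 2],
    # which maps to low/medium/high in RiskClassifierTrainer.RISK_LABELS order.
    clf = joblib.load(os.path.join(config.MODELS_DIR, f'risk_{name}_classifier.pkl'))
    proba = clf.predict_proba(X_selected)
    return [
        {label: round(float(p), 4) for label, p in zip(RiskClassifierTrainer.RISK_LABELS, row)}
        for row in proba
    ]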