fix(training): patch LightGBM sklearn compatibility; add optional LSTM+MLP model and event timeline
backend/core/deep_learning_model.py (new file, +299 lines)
@@ -0,0 +1,299 @@
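# Optional LSTM + MLP regressor for absence-duration prediction. The module
# stays importable without PyTorch; callers check is_available() before use.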
import os
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import config
from core.model_features import engineer_features

try:
    import torch
    import torch.nn as nn
    from torch.utils.data import DataLoader, TensorDataset
except ImportError:
    torch = None
    nn = None
    DataLoader = None
    TensorDataset = None


WINDOW_SIZE = 5
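# Each sample pairs a sliding window over an employee's last WINDOW_SIZE
# absence events (SEQUENCE_FEATURES, consumed by the LSTM) with per-employee
# attributes (STATIC_FEATURES, consumed by the MLP branch).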
SEQUENCE_FEATURES = [
    '缺勤月份',  # month of the absence
    '星期几',  # day of week
    '是否节假日前后',  # adjacent to a public holiday
    '请假类型',  # leave type
    '请假原因大类',  # broad leave-reason category
    '是否提供医院证明',  # hospital certificate provided
    '是否临时请假',  # last-minute leave request
    '是否连续缺勤',  # part of a consecutive absence
    '前一工作日是否加班',  # overtime on the previous workday
    '月均加班时长',  # average monthly overtime hours
    '通勤时长分钟',  # commute time in minutes
    '是否夜班岗位',  # night-shift position
    '是否慢性病史',  # history of chronic illness
    '加班通勤压力指数',  # overtime/commute stress index
    '缺勤历史强度',  # absence-history intensity
]
STATIC_FEATURES = [
    '所属行业',  # industry
    '婚姻状态',  # marital status
    '岗位序列',  # job family
    '岗位级别',  # job level
    '年龄',  # age
    '司龄年数',  # years of tenure
    '子女数量',  # number of children
    '班次类型',  # shift type
    '绩效等级',  # performance grade
    'BMI',
    '健康风险指数',  # health risk index
    '家庭负担指数',  # family burden index
    '岗位稳定性指数',  # job stability index
]


# Fall back to `object` so this module can be imported when torch is missing;
# the class is only instantiated after an is_available() check succeeds.
_TorchModule = nn.Module if nn is not None else object


class LSTMMLPRegressor(_TorchModule):
    def __init__(self, seq_input_dim: int, static_input_dim: int):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=seq_input_dim,
            hidden_size=48,
            num_layers=1,
            batch_first=True,
            dropout=0.0,
        )
        self.static_net = nn.Sequential(
            nn.Linear(static_input_dim, 32),
            nn.ReLU(),
            nn.Dropout(0.1),
        )
        self.fusion = nn.Sequential(
            nn.Linear(48 + 32, 48),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(48, 1),
        )

    def forward(self, sequence_x, static_x):
        lstm_output, _ = self.lstm(sequence_x)
        # Summarize the sequence with the last timestep's hidden state.
        sequence_repr = lstm_output[:, -1, :]
        static_repr = self.static_net(static_x)
        fused = torch.cat([sequence_repr, static_repr], dim=1)
        return self.fusion(fused).squeeze(1)


def is_available() -> bool:
    return torch is not None


def _fit_category_maps(df: pd.DataFrame, features: List[str]) -> Dict[str, Dict[str, int]]:
    # Build a stable string -> integer code for every non-numeric feature;
    # categories unseen at fit time fall back to code 0 when applied.
    category_maps = {}
    for feature in features:
        if feature not in df.columns:
            continue
        if pd.api.types.is_numeric_dtype(df[feature]):
            continue
        values = sorted(df[feature].astype(str).unique().tolist())
        category_maps[feature] = {value: idx for idx, value in enumerate(values)}
    return category_maps


def _apply_category_maps(df: pd.DataFrame, features: List[str], category_maps: Dict[str, Dict[str, int]]) -> pd.DataFrame:
    encoded = df.copy()
    for feature in features:
        if feature not in encoded.columns:
            # Missing columns are filled with a neutral 0 so array shapes stay fixed.
            encoded[feature] = 0
            continue
        if feature in category_maps:
            mapper = category_maps[feature]
            encoded[feature] = encoded[feature].astype(str).map(lambda value: mapper.get(value, 0))
    return encoded


def _safe_standardize(values: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    mean = values.mean(axis=0)
    std = values.std(axis=0)
    # Replace near-zero deviations so constant features do not divide by zero.
    std = np.where(std < 1e-6, 1.0, std)
    return mean.astype(np.float32), std.astype(np.float32)
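

# Window layout: an employee with three events e1..e3 and WINDOW_SIZE = 5
# yields the samples [0,0,0,0,e1], [0,0,0,e1,e2], and [0,0,e1,e2,e3], each
# predicting the absence duration of its final event.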
def _build_sequence_arrays(
    df: pd.DataFrame,
    category_maps: Dict[str, Dict[str, int]],
    target_transform: str,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    df = engineer_features(df.copy())
    features = sorted(set(SEQUENCE_FEATURES + STATIC_FEATURES))
    df = _apply_category_maps(df, features, category_maps)
    df = df.sort_values(
        [config.EMPLOYEE_ID_COLUMN, config.EVENT_DATE_INDEX_COLUMN, config.EVENT_SEQUENCE_COLUMN]
    ).reset_index(drop=True)

    sequence_samples = []
    static_samples = []
    targets = []

    for _, group in df.groupby(config.EMPLOYEE_ID_COLUMN, sort=False):
        seq_values = group[SEQUENCE_FEATURES].astype(float).values
        static_values = group[STATIC_FEATURES].astype(float).values
        target_values = group[config.TARGET_COLUMN].astype(float).values

        for index in range(len(group)):
            # Take up to WINDOW_SIZE events ending at the current one and
            # left-pad shorter histories with zeros.
            window_slice = seq_values[max(0, index - WINDOW_SIZE + 1): index + 1]
            sequence_window = np.zeros((WINDOW_SIZE, len(SEQUENCE_FEATURES)), dtype=np.float32)
            sequence_window[-len(window_slice):] = window_slice
            sequence_samples.append(sequence_window)
            static_samples.append(static_values[index].astype(np.float32))
            targets.append(float(target_values[index]))

    targets = np.array(targets, dtype=np.float32)
    if target_transform == 'log1p':
        # Compress the long right tail of absence hours; negatives are clipped first.
        targets = np.log1p(np.clip(targets, a_min=0, a_max=None)).astype(np.float32)

    return (
        np.array(sequence_samples, dtype=np.float32),
        np.array(static_samples, dtype=np.float32),
        targets,
    )


def train_lstm_mlp(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    model_path: str,
    target_transform: str = 'log1p',
    epochs: int = 24,
    batch_size: int = 128,
) -> Optional[Dict]:
    if torch is None:
        return None

    used_features = sorted(set(SEQUENCE_FEATURES + STATIC_FEATURES))
    category_maps = _fit_category_maps(train_df, used_features)
    train_seq, train_static, y_train = _build_sequence_arrays(train_df, category_maps, target_transform)
    test_seq, test_static, y_test_transformed = _build_sequence_arrays(test_df, category_maps, target_transform)

    # Standardize with statistics fitted on the training split only.
    seq_mean, seq_std = _safe_standardize(train_seq.reshape(-1, train_seq.shape[-1]))
    static_mean, static_std = _safe_standardize(train_static)

    train_seq = ((train_seq - seq_mean) / seq_std).astype(np.float32)
    test_seq = ((test_seq - seq_mean) / seq_std).astype(np.float32)
    train_static = ((train_static - static_mean) / static_std).astype(np.float32)
    test_static = ((test_static - static_mean) / static_std).astype(np.float32)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if device.type == 'cuda':
        device_name = torch.cuda.get_device_name(device)
        print(f'[lstm_mlp] Training device: CUDA ({device_name})')
    else:
        print('[lstm_mlp] Training device: CPU')
    model = LSTMMLPRegressor(seq_input_dim=train_seq.shape[-1], static_input_dim=train_static.shape[-1]).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.MSELoss()

    train_dataset = TensorDataset(
        torch.tensor(train_seq),
        torch.tensor(train_static),
        torch.tensor(y_train),
    )
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    model.train()
    for _ in range(epochs):
        for batch_seq, batch_static, batch_target in train_loader:
            batch_seq = batch_seq.to(device)
            batch_static = batch_static.to(device)
            batch_target = batch_target.to(device)

            optimizer.zero_grad()
            predictions = model(batch_seq, batch_static)
            loss = criterion(predictions, batch_target)
            loss.backward()
            optimizer.step()

    model.eval()
    with torch.no_grad():
        predictions = model(
            torch.tensor(test_seq).to(device),
            torch.tensor(test_static).to(device),
        ).cpu().numpy()

    # Recover y_true from the transformed targets so it follows the same
    # (employee, date, sequence) ordering as the predictions; test_df itself is
    # in shuffled split order and would misalign the metrics.
    if target_transform == 'log1p':
        y_pred = np.expm1(predictions)
        y_true = np.expm1(y_test_transformed)
    else:
        y_pred = predictions
        y_true = y_test_transformed
    y_pred = np.clip(y_pred, a_min=0, a_max=None)
    mse = mean_squared_error(y_true, y_pred)

    # Mean of the (standardized) first WINDOW_SIZE - 1 timesteps across training
    # samples; used at inference to pad a single event into a full window.
    default_prefix = train_seq[:, :-1, :].mean(axis=0).astype(np.float32)
    bundle = {
        'state_dict': model.state_dict(),
        'sequence_features': SEQUENCE_FEATURES,
        'static_features': STATIC_FEATURES,
        'category_maps': category_maps,
        'seq_mean': seq_mean,
        'seq_std': seq_std,
        'static_mean': static_mean,
        'static_std': static_std,
        'default_sequence_prefix': default_prefix,
        'window_size': WINDOW_SIZE,
        'target_transform': target_transform,
        'sequence_input_dim': train_seq.shape[-1],
        'static_input_dim': train_static.shape[-1],
    }
    torch.save(bundle, model_path)

    return {
        'metrics': {
            'r2': round(r2_score(y_true, y_pred), 4),
            'mse': round(mse, 4),
            'rmse': round(float(np.sqrt(mse)), 4),
            'mae': round(mean_absolute_error(y_true, y_pred), 4),
        },
        'metadata': {
            'sequence_window_size': WINDOW_SIZE,
            'sequence_feature_names': SEQUENCE_FEATURES,
            'static_feature_names': STATIC_FEATURES,
        },
    }


def load_lstm_mlp_bundle(model_path: str) -> Optional[Dict]:
    if torch is None or not os.path.exists(model_path):
        return None
    # The bundle holds plain dicts and numpy arrays, so newer PyTorch releases
    # (2.6+, where weights_only defaults to True) need the explicit opt-out.
    bundle = torch.load(model_path, map_location='cpu', weights_only=False)
    model = LSTMMLPRegressor(
        seq_input_dim=bundle['sequence_input_dim'],
        static_input_dim=bundle['static_input_dim'],
    )
    model.load_state_dict(bundle['state_dict'])
    model.eval()
    bundle['model'] = model
    return bundle
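
# Typical round trip (sketch): bundle = load_lstm_mlp_bundle(model_path), then
# predict_lstm_mlp(bundle, employee_df) with a single-row DataFrame holding the
# raw feature columns listed above.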


def predict_lstm_mlp(bundle: Dict, current_df: pd.DataFrame) -> float:
    df = engineer_features(current_df.copy())
    used_features = sorted(set(bundle['sequence_features'] + bundle['static_features']))
    df = _apply_category_maps(df, used_features, bundle['category_maps'])

    sequence_row = df[bundle['sequence_features']].astype(float).values[0].astype(np.float32)
    static_row = df[bundle['static_features']].astype(float).values[0].astype(np.float32)

    # The stored prefix is already in standardized space (it was averaged from
    # standardized training windows), so only the current event is standardized
    # before the two are stacked into a full window.
    sequence_row = (sequence_row - bundle['seq_mean']) / bundle['seq_std']
    prefix = bundle['default_sequence_prefix']
    sequence_window = np.vstack([prefix, sequence_row.reshape(1, -1)]).astype(np.float32)
    static_row = ((static_row - bundle['static_mean']) / bundle['static_std']).astype(np.float32)

    with torch.no_grad():
        prediction = bundle['model'](
            torch.tensor(sequence_window).unsqueeze(0),
            torch.tensor(static_row).unsqueeze(0),
        ).cpu().numpy()[0]

    if bundle.get('target_transform') == 'log1p':
        prediction = np.expm1(prediction)
    # Floor at half an hour so callers never see a zero or negative duration.
    return float(max(0.5, prediction))
@@ -264,6 +264,28 @@ def sample_event(rng, employee):
    return event


+def attach_event_timeline(df):
+    df = df.copy()
+    rng = np.random.default_rng(config.RANDOM_STATE)
+    base_date = np.datetime64('2025-01-01')
+    timelines = []
+
+    for employee_id, group in df.groupby('员工编号', sort=False):  # group by employee ID
+        group = group.copy().reset_index(drop=True)
+        event_count = len(group)
+        # Sorted day offsets keep 事件日期索引 (event date index) increasing
+        # with 事件序号 (event sequence number).
+        offsets = np.sort(rng.integers(0, 365, size=event_count))
+        group['事件日期'] = [
+            str(pd.Timestamp(base_date + np.timedelta64(int(offset), 'D')).date())
+            for offset in offsets
+        ]
+        group['事件日期索引'] = offsets.astype(int)
+        group['事件序号'] = np.arange(1, event_count + 1)
+        group['员工历史事件数'] = event_count
+        timelines.append(group)
+
+    return pd.concat(timelines, ignore_index=True)


def validate_dataset(df):
    required_columns = [
        '员工编号',  # employee ID
@@ -273,6 +295,9 @@ def validate_dataset(df):
        '通勤时长分钟',
        '是否慢性病史',
        '请假类型',
+        '事件序号',  # event sequence number
+        '事件日期索引',  # event date index
+        '员工历史事件数',  # employee's total event count
        '缺勤时长(小时)',  # absence duration in hours, the target column
    ]
    for column in required_columns:
@@ -309,7 +334,7 @@ def generate_dataset(output_path=None, sample_count=12000, random_state=None):
    for idx in employee_idx:
        events.append(sample_event(rng, employees[int(idx)]))

-    df = pd.DataFrame(events)
+    df = attach_event_timeline(pd.DataFrame(events))
    validate_dataset(df)

    if output_path:
@@ -1,6 +1,7 @@
import os
import sys
import time
+import inspect
from datetime import datetime

import joblib
@@ -14,6 +15,8 @@ from sklearn.preprocessing import RobustScaler
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import config
+from core.deep_learning_model import is_available as deep_learning_available
+from core.deep_learning_model import train_lstm_mlp
from core.model_features import (
    NUMERICAL_OUTLIER_COLUMNS,
    ORDINAL_COLUMNS,
@@ -43,6 +46,37 @@ except ImportError:
    xgb = None


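+# scikit-learn renamed check_X_y's force_all_finite parameter to
+# ensure_all_finite (deprecated in 1.6, later removed); older LightGBM builds
+# still forward the old name, so wrap the validator and translate the keyword.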
+def patch_lightgbm_sklearn_compatibility():
+    if lgb is None:
+        return
+
+    try:
+        from sklearn.utils.validation import check_X_y
+    except Exception:
+        return
+
+    params = inspect.signature(check_X_y).parameters
+    # Only patch when the new keyword exists and the old one is gone.
+    if 'force_all_finite' in params or 'ensure_all_finite' not in params:
+        return
+
+    def wrapped_check_X_y(*args, force_all_finite=None, **kwargs):
+        if force_all_finite is not None and 'ensure_all_finite' not in kwargs:
+            kwargs['ensure_all_finite'] = force_all_finite
+        return check_X_y(*args, **kwargs)
+
+    try:
+        import lightgbm.compat as lgb_compat
+        import lightgbm.sklearn as lgb_sklearn
+
+        lgb_compat._LGBMCheckXY = wrapped_check_X_y
+        lgb_sklearn._LGBMCheckXY = wrapped_check_X_y
+    except Exception:
+        pass
+
+
+patch_lightgbm_sklearn_compatibility()


def print_training_log(model_name, start_time, best_score, best_params, n_iter, cv_folds):
    elapsed = time.time() - start_time
    print(f' {"-" * 50}')
@@ -68,6 +102,10 @@ class OptimizedModelTrainer:
        self.feature_k = 22
        self.target_transform = 'log1p'
        self.enabled_models = ['random_forest', 'gradient_boosting', 'extra_trees', 'lightgbm', 'xgboost']
+        if deep_learning_available():
+            self.enabled_models.append('lstm_mlp')
+        # Raw (pre-encoding) splits are kept for the deep model, which does
+        # its own feature engineering.
+        self.raw_train_df = None
+        self.raw_test_df = None

    def analyze_data(self, df):
        y = df[TARGET_COLUMN]
@@ -96,19 +134,21 @@ class OptimizedModelTrainer:
        return self.feature_selector.transform(X) if self.feature_selector else X

    def prepare_data(self):
-        df = normalize_columns(get_clean_data())
-        df = prepare_modeling_dataframe(df)
-        self.analyze_data(df)
+        raw_df = normalize_columns(get_clean_data())
+        self.analyze_data(prepare_modeling_dataframe(raw_df.copy()))

-        target_bins = make_target_bins(df[TARGET_COLUMN].values)
-        train_df, test_df = train_test_split(
-            df,
+        # Split on the raw frame so the deep model sees un-encoded rows.
+        target_bins = make_target_bins(raw_df[TARGET_COLUMN].values)
+        raw_train_df, raw_test_df = train_test_split(
+            raw_df,
            test_size=config.TEST_SIZE,
            random_state=config.RANDOM_STATE,
            stratify=target_bins,
        )
-        train_df = train_df.reset_index(drop=True)
-        test_df = test_df.reset_index(drop=True)
+        self.raw_train_df = raw_train_df.reset_index(drop=True)
+        self.raw_test_df = raw_test_df.reset_index(drop=True)
+
+        train_df = prepare_modeling_dataframe(self.raw_train_df)
+        test_df = prepare_modeling_dataframe(self.raw_test_df)

        self.outlier_bounds = fit_outlier_bounds(train_df, NUMERICAL_OUTLIER_COLUMNS)
        train_df = apply_outlier_bounds(train_df, self.outlier_bounds)
@@ -138,7 +178,8 @@ class OptimizedModelTrainer:
            'feature_count_after_selection': int(X_train.shape[1]),
            'training_date': datetime.now().strftime('%Y-%m-%d'),
            'target_transform': self.target_transform,
-            'available_models': list(self.enabled_models),
+            # Filled in at save time from the models that actually trained.
+            'available_models': [],
+            'deep_learning_available': False,
        }
        return X_train, X_test, y_train, y_test

@@ -206,20 +247,25 @@ class OptimizedModelTrainer:
    def train_lightgbm(self, X_train, y_train):
        if lgb is None:
            return
-        self._run_search(
-            'lightgbm',
-            lgb.LGBMRegressor(random_state=config.RANDOM_STATE, n_jobs=-1, verbose=-1),
-            {
-                'n_estimators': [180, 260, 340],
-                'max_depth': [7, 9, -1],
-                'learning_rate': [0.03, 0.05, 0.08],
-                'subsample': [0.7, 0.85, 1.0],
-                'colsample_bytree': [0.7, 0.85, 1.0],
-                'num_leaves': [31, 50, 70],
-            },
-            X_train,
-            y_train,
-        )
+        # If LightGBM still fails against this scikit-learn build, skip it
+        # instead of aborting the whole training run.
+        try:
+            self._run_search(
+                'lightgbm',
+                lgb.LGBMRegressor(random_state=config.RANDOM_STATE, n_jobs=-1, verbose=-1),
+                {
+                    'n_estimators': [180, 260, 340],
+                    'max_depth': [7, 9, -1],
+                    'learning_rate': [0.03, 0.05, 0.08],
+                    'subsample': [0.7, 0.85, 1.0],
+                    'colsample_bytree': [0.7, 0.85, 1.0],
+                    'num_leaves': [31, 50, 70],
+                },
+                X_train,
+                y_train,
+            )
+        except Exception as exc:
+            print(f' {"-" * 50}')
+            print(' Model: lightgbm')
+            print(f' Skipped: {exc}')

    def train_xgboost(self, X_train, y_train):
        if xgb is None:
@@ -254,6 +300,7 @@ class OptimizedModelTrainer:
        os.makedirs(config.MODELS_DIR, exist_ok=True)
        for name, model in self.models.items():
            joblib.dump(model, os.path.join(config.MODELS_DIR, f'{name}_model.pkl'))
+        self.training_metadata['available_models'] = list(self.model_metrics.keys())
        joblib.dump(self.scaler, config.SCALER_PATH)
        joblib.dump(self.feature_names, os.path.join(config.MODELS_DIR, 'feature_names.pkl'))
        joblib.dump(self.selected_features, os.path.join(config.MODELS_DIR, 'selected_features.pkl'))
@@ -282,6 +329,23 @@ class OptimizedModelTrainer:
        self.model_metrics[name] = metrics
        print(f' {name:20s} R2={metrics["r2"]:.4f} RMSE={metrics["rmse"]:.4f} MAE={metrics["mae"]:.4f}')

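+        # The deep model trains on the raw splits because train_lstm_mlp runs
+        # its own feature engineering, encoding, and windowing.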
if 'lstm_mlp' in self.enabled_models and self.raw_train_df is not None and self.raw_test_df is not None:
|
||||
deep_model_path = os.path.join(config.MODELS_DIR, 'lstm_mlp_model.pt')
|
||||
deep_result = train_lstm_mlp(
|
||||
self.raw_train_df,
|
||||
self.raw_test_df,
|
||||
deep_model_path,
|
||||
target_transform=self.target_transform,
|
||||
)
|
||||
if deep_result:
|
||||
self.model_metrics['lstm_mlp'] = deep_result['metrics']
|
||||
self.training_metadata['deep_learning_available'] = True
|
||||
self.training_metadata.update(deep_result['metadata'])
|
||||
print(
|
||||
f' {"lstm_mlp":20s} R2={deep_result["metrics"]["r2"]:.4f} '
|
||||
f'RMSE={deep_result["metrics"]["rmse"]:.4f} MAE={deep_result["metrics"]["mae"]:.4f}'
|
||||
)
|
||||
|
||||
self.save_models()
|
||||
return self.model_metrics
|
||||