feat: 将数据集从国外员工缺勤数据替换为中国企业缺勤模拟数据
- 新增中国企业员工缺勤模拟数据集生成脚本(generate_dataset.py),覆盖7个行业、180家企业、2600名员工 - 重构 config.py,更新特征字段为中文名称,调整目标列、员工ID、行业类型等配置 - 重构 clustering.py,简化聚类逻辑,更新聚类特征和群体命名(高压通勤型、健康波动型等) - 重构 feature_mining.py,更新相关性分析和群体比较维度(按行业、班次、婚姻状态等) - 新增 model_features.py 定义模型训练特征 - 更新 preprocessing.py 和 train_model.py 适配新数据结构 - 更新各 API 路由默认参数(model: random_forest, dimension: industry) - 前端更新主题样式和各视图组件适配中文字段 - 更新系统名称为 China Enterprise Absence Analysis System
This commit is contained in:
@@ -1,123 +1,57 @@
|
||||
import sys
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime
|
||||
|
||||
import joblib
|
||||
import numpy as np
|
||||
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor
|
||||
from sklearn.feature_selection import SelectKBest, f_regression
|
||||
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
|
||||
from sklearn.model_selection import RandomizedSearchCV, train_test_split
|
||||
from sklearn.preprocessing import RobustScaler
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import time
|
||||
from sklearn.ensemble import (
|
||||
RandomForestRegressor,
|
||||
GradientBoostingRegressor,
|
||||
ExtraTreesRegressor,
|
||||
StackingRegressor
|
||||
)
|
||||
from sklearn.linear_model import Ridge
|
||||
from sklearn.model_selection import train_test_split, RandomizedSearchCV
|
||||
from sklearn.preprocessing import RobustScaler, LabelEncoder
|
||||
from sklearn.feature_selection import SelectKBest, f_regression
|
||||
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
|
||||
import xgboost as xgb
|
||||
import lightgbm as lgb
|
||||
import joblib
|
||||
import warnings
|
||||
warnings.filterwarnings('ignore')
|
||||
|
||||
import config
|
||||
from core.model_features import (
|
||||
NUMERICAL_OUTLIER_COLUMNS,
|
||||
ORDINAL_COLUMNS,
|
||||
TARGET_COLUMN,
|
||||
align_feature_frame,
|
||||
apply_label_encoders,
|
||||
apply_outlier_bounds,
|
||||
engineer_features,
|
||||
extract_xy,
|
||||
fit_label_encoders,
|
||||
fit_outlier_bounds,
|
||||
make_target_bins,
|
||||
normalize_columns,
|
||||
prepare_modeling_dataframe,
|
||||
to_float_array,
|
||||
)
|
||||
from core.preprocessing import get_clean_data
|
||||
|
||||
try:
|
||||
import lightgbm as lgb
|
||||
except ImportError:
|
||||
lgb = None
|
||||
|
||||
try:
|
||||
import xgboost as xgb
|
||||
except ImportError:
|
||||
xgb = None
|
||||
|
||||
|
||||
def print_training_log(model_name, start_time, best_score, best_params, n_iter, cv_folds):
|
||||
elapsed = time.time() - start_time
|
||||
print(f" {'─'*50}")
|
||||
print(f" Model: {model_name}")
|
||||
print(f" Time: {elapsed:.1f}s")
|
||||
print(f" Best CV R2: {best_score:.4f}")
|
||||
print(f" Best params:")
|
||||
for k, v in best_params.items():
|
||||
print(f" - {k}: {v}")
|
||||
print(f" Iterations: {n_iter}, CV folds: {cv_folds}")
|
||||
print(f" {'─'*50}")
|
||||
|
||||
|
||||
class DataAugmenter:
|
||||
def __init__(self, noise_level=0.02, n_augment=2):
|
||||
self.noise_level = noise_level
|
||||
self.n_augment = n_augment
|
||||
|
||||
def augment(self, df, target_col='Absenteeism time in hours'):
|
||||
print(f"\nData Augmentation...")
|
||||
print(f" Original size: {len(df)}")
|
||||
|
||||
augmented_dfs = [df]
|
||||
|
||||
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
|
||||
if target_col in numerical_cols:
|
||||
numerical_cols.remove(target_col)
|
||||
|
||||
for i in range(self.n_augment):
|
||||
df_aug = df.copy()
|
||||
|
||||
for col in numerical_cols:
|
||||
if col in df_aug.columns:
|
||||
std_val = df_aug[col].std()
|
||||
if std_val > 0:
|
||||
noise = np.random.normal(0, self.noise_level * std_val, len(df_aug))
|
||||
df_aug[col] = df_aug[col] + noise
|
||||
|
||||
augmented_dfs.append(df_aug)
|
||||
|
||||
df_result = pd.concat(augmented_dfs, ignore_index=True)
|
||||
print(f" Augmented size: {len(df_result)}")
|
||||
|
||||
return df_result
|
||||
|
||||
def smote_regression(self, df, target_col='Absenteeism time in hours'):
|
||||
df = df.copy()
|
||||
y = df[target_col].values
|
||||
|
||||
bins = [0, 1, 4, 8, 100]
|
||||
labels = ['zero', 'low', 'medium', 'high']
|
||||
df['_target_bin'] = pd.cut(y, bins=bins, labels=labels, include_lowest=True)
|
||||
|
||||
bin_counts = df['_target_bin'].value_counts()
|
||||
max_count = bin_counts.max()
|
||||
|
||||
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
|
||||
if target_col in numerical_cols:
|
||||
numerical_cols.remove(target_col)
|
||||
if '_target_bin' in numerical_cols:
|
||||
numerical_cols.remove('_target_bin')
|
||||
|
||||
augmented_rows = []
|
||||
for bin_label in labels:
|
||||
bin_df = df[df['_target_bin'] == bin_label].drop(columns=['_target_bin'])
|
||||
bin_size = len(bin_df)
|
||||
|
||||
if bin_size < max_count and bin_size > 0:
|
||||
n_samples_to_add = max_count - bin_size
|
||||
|
||||
for _ in range(n_samples_to_add):
|
||||
idx = np.random.choice(bin_df.index)
|
||||
sample = bin_df.loc[idx].copy()
|
||||
|
||||
for col in numerical_cols:
|
||||
if col in sample.index:
|
||||
std_val = bin_df[col].std()
|
||||
if std_val > 0:
|
||||
noise = np.random.normal(0, 0.02 * std_val)
|
||||
sample[col] = sample[col] + noise
|
||||
|
||||
augmented_rows.append(sample)
|
||||
|
||||
if augmented_rows:
|
||||
df_aug = pd.DataFrame(augmented_rows)
|
||||
df_result = pd.concat([df.drop(columns=['_target_bin']), df_aug], ignore_index=True)
|
||||
else:
|
||||
df_result = df.drop(columns=['_target_bin'])
|
||||
|
||||
print(f" After SMOTE-like augmentation: {len(df_result)}")
|
||||
|
||||
return df_result
|
||||
print(f' {"-" * 50}')
|
||||
print(f' Model: {model_name}')
|
||||
print(f' Time: {elapsed:.1f}s')
|
||||
print(f' Best CV R2: {best_score:.4f}')
|
||||
for key, value in best_params.items():
|
||||
print(f' - {key}: {value}')
|
||||
print(f' Iterations: {n_iter}, CV folds: {cv_folds}')
|
||||
|
||||
|
||||
class OptimizedModelTrainer:
|
||||
@@ -128,461 +62,237 @@ class OptimizedModelTrainer:
|
||||
self.selected_features = None
|
||||
self.label_encoders = {}
|
||||
self.model_metrics = {}
|
||||
self.augmenter = DataAugmenter(noise_level=0.02, n_augment=2)
|
||||
|
||||
self.training_metadata = {}
|
||||
self.feature_selector = None
|
||||
self.outlier_bounds = {}
|
||||
self.feature_k = 22
|
||||
self.target_transform = 'log1p'
|
||||
self.enabled_models = ['random_forest', 'gradient_boosting', 'extra_trees', 'lightgbm', 'xgboost']
|
||||
|
||||
def analyze_data(self, df):
|
||||
print("\n" + "="*60)
|
||||
print("Data Analysis")
|
||||
print("="*60)
|
||||
|
||||
y = df['Absenteeism time in hours']
|
||||
|
||||
print(f"\nTarget variable statistics:")
|
||||
print(f" Min: {y.min()}")
|
||||
print(f" Max: {y.max()}")
|
||||
print(f" Mean: {y.mean():.2f}")
|
||||
print(f" Median: {y.median():.2f}")
|
||||
print(f" Std: {y.std():.2f}")
|
||||
print(f" Skewness: {y.skew():.2f}")
|
||||
|
||||
print(f"\nTarget distribution:")
|
||||
print(f" Zero values: {(y == 0).sum()} ({(y == 0).sum() / len(y) * 100:.1f}%)")
|
||||
print(f" 1-8 hours: {((y > 0) & (y <= 8)).sum()} ({((y > 0) & (y <= 8)).sum() / len(y) * 100:.1f}%)")
|
||||
print(f" >8 hours: {(y > 8).sum()} ({(y > 8).sum() / len(y) * 100:.1f}%)")
|
||||
|
||||
return y
|
||||
|
||||
def clip_outliers(self, df, columns, lower_pct=1, upper_pct=99):
|
||||
df_clean = df.copy()
|
||||
|
||||
for col in columns:
|
||||
if col in df_clean.columns and df_clean[col].dtype in ['int64', 'float64']:
|
||||
if col == 'Absenteeism time in hours':
|
||||
continue
|
||||
lower = df_clean[col].quantile(lower_pct / 100)
|
||||
upper = df_clean[col].quantile(upper_pct / 100)
|
||||
df_clean[col] = df_clean[col].clip(lower, upper)
|
||||
|
||||
return df_clean
|
||||
|
||||
def feature_engineering(self, df):
|
||||
df = df.copy()
|
||||
|
||||
df['workload_per_age'] = df['Work load Average/day'] / (df['Age'] + 1)
|
||||
df['expense_per_distance'] = df['Transportation expense'] / (df['Distance from Residence to Work'] + 1)
|
||||
df['age_service_ratio'] = df['Age'] / (df['Service time'] + 1)
|
||||
|
||||
df['has_children'] = (df['Son'] > 0).astype(int)
|
||||
df['has_pet'] = (df['Pet'] > 0).astype(int)
|
||||
df['family_responsibility'] = df['Son'] + df['Pet']
|
||||
|
||||
df['health_risk'] = ((df['Social drinker'] == 1) | (df['Social smoker'] == 1) | (df['Body mass index'] > 30)).astype(int)
|
||||
df['lifestyle_risk'] = df['Social drinker'].astype(int) + df['Social smoker'].astype(int)
|
||||
|
||||
df['age_group'] = pd.cut(df['Age'], bins=[0, 30, 40, 50, 100], labels=[1, 2, 3, 4])
|
||||
df['service_group'] = pd.cut(df['Service time'], bins=[0, 5, 10, 20, 100], labels=[1, 2, 3, 4])
|
||||
df['bmi_category'] = pd.cut(df['Body mass index'], bins=[0, 18.5, 25, 30, 100], labels=[1, 2, 3, 4])
|
||||
|
||||
df['workload_category'] = pd.cut(df['Work load Average/day'], bins=[0, 200, 250, 300, 500], labels=[1, 2, 3, 4])
|
||||
df['commute_category'] = pd.cut(df['Distance from Residence to Work'], bins=[0, 10, 20, 50, 100], labels=[1, 2, 3, 4])
|
||||
|
||||
df['seasonal_risk'] = df['Seasons'].apply(lambda x: 1 if x in [1, 3] else 0)
|
||||
df['weekday_risk'] = df['Day of the week'].apply(lambda x: 1 if x in [2, 6] else 0)
|
||||
|
||||
df['hit_target_ratio'] = df['Hit target'] / 100
|
||||
df['experience_level'] = pd.cut(df['Service time'], bins=[0, 5, 10, 15, 100], labels=[1, 2, 3, 4])
|
||||
|
||||
df['age_workload_interaction'] = df['Age'] * df['Work load Average/day'] / 10000
|
||||
df['service_bmi_interaction'] = df['Service time'] * df['Body mass index'] / 100
|
||||
|
||||
return df
|
||||
|
||||
y = df[TARGET_COLUMN]
|
||||
print('\nData Analysis')
|
||||
print(f' Samples: {len(df)}')
|
||||
print(f' Mean: {y.mean():.2f}, Median: {y.median():.2f}, Std: {y.std():.2f}')
|
||||
print(f' High risk ratio (>8h): {(y > 8).mean() * 100:.1f}%')
|
||||
|
||||
def select_features(self, X, y, k=20):
|
||||
print("\nFeature Selection...")
|
||||
|
||||
selector = SelectKBest(score_func=f_regression, k=min(k, X.shape[1]))
|
||||
selector.fit(X, y)
|
||||
|
||||
scores = selector.scores_
|
||||
feature_scores = list(zip(self.feature_names, scores))
|
||||
feature_scores.sort(key=lambda x: x[1], reverse=True)
|
||||
|
||||
print(f"\nTop {min(k, len(feature_scores))} features by F-score:")
|
||||
for i, (name, score) in enumerate(feature_scores[:min(k, len(feature_scores))]):
|
||||
cn = config.FEATURE_NAME_CN.get(name, name)
|
||||
print(f" {i+1}. {cn}: {score:.2f}")
|
||||
|
||||
selected_mask = selector.get_support()
|
||||
self.selected_features = [f for f, s in zip(self.feature_names, selected_mask) if s]
|
||||
|
||||
self.feature_selector = selector
|
||||
mask = selector.get_support()
|
||||
self.selected_features = [name for name, keep in zip(self.feature_names, mask) if keep]
|
||||
return selector.transform(X)
|
||||
|
||||
|
||||
def transform_target(self, y):
|
||||
return np.log1p(np.clip(y, a_min=0, a_max=None)) if self.target_transform == 'log1p' else y
|
||||
|
||||
def inverse_transform_target(self, y_pred):
|
||||
return np.expm1(y_pred) if self.target_transform == 'log1p' else y_pred
|
||||
|
||||
def transform_features(self, X_df):
|
||||
X_df = align_feature_frame(X_df, self.feature_names)
|
||||
X = self.scaler.transform(to_float_array(X_df))
|
||||
return self.feature_selector.transform(X) if self.feature_selector else X
|
||||
|
||||
def prepare_data(self):
|
||||
df = get_clean_data()
|
||||
df.columns = [col.strip() for col in df.columns]
|
||||
|
||||
df = df.drop(columns=['ID'])
|
||||
|
||||
cols_to_drop = ['Weight', 'Height', 'Reason for absence']
|
||||
for col in cols_to_drop:
|
||||
if col in df.columns:
|
||||
df = df.drop(columns=[col])
|
||||
print(" Removed features: Weight, Height, Reason for absence (data leakage risk)")
|
||||
|
||||
df = normalize_columns(get_clean_data())
|
||||
df = prepare_modeling_dataframe(df)
|
||||
self.analyze_data(df)
|
||||
|
||||
print("\n" + "="*60)
|
||||
print("Data Preprocessing")
|
||||
print("="*60)
|
||||
|
||||
numerical_cols = ['Age', 'Service time', 'Work load Average/day',
|
||||
'Transportation expense', 'Distance from Residence to Work',
|
||||
'Hit target', 'Body mass index']
|
||||
df = self.clip_outliers(df, numerical_cols)
|
||||
print(" Outliers clipped (1st-99th percentile)")
|
||||
|
||||
print("\n" + "="*60)
|
||||
print("Data Augmentation")
|
||||
print("="*60)
|
||||
|
||||
df = self.augmenter.smote_regression(df)
|
||||
df = self.augmenter.augment(df)
|
||||
|
||||
print("\n" + "="*60)
|
||||
print("Feature Engineering")
|
||||
print("="*60)
|
||||
|
||||
df = self.feature_engineering(df)
|
||||
|
||||
y = df['Absenteeism time in hours'].values
|
||||
X_df = df.drop(columns=['Absenteeism time in hours'])
|
||||
|
||||
ordinal_cols = ['Month of absence', 'Day of the week', 'Seasons',
|
||||
'Disciplinary failure', 'Education', 'Social drinker',
|
||||
'Social smoker', 'age_group', 'service_group',
|
||||
'bmi_category', 'workload_category', 'commute_category',
|
||||
'experience_level']
|
||||
|
||||
for col in ordinal_cols:
|
||||
if col in X_df.columns:
|
||||
le = LabelEncoder()
|
||||
X_df[col] = le.fit_transform(X_df[col].astype(str))
|
||||
self.label_encoders[col] = le
|
||||
|
||||
self.feature_names = list(X_df.columns)
|
||||
|
||||
X = X_df.values.astype(float)
|
||||
|
||||
X = self.scaler.fit_transform(X)
|
||||
|
||||
X = self.select_features(X, y, k=20)
|
||||
|
||||
print(f"\nFinal feature count: {X.shape[1]}")
|
||||
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
X, y, test_size=0.2, random_state=42
|
||||
|
||||
target_bins = make_target_bins(df[TARGET_COLUMN].values)
|
||||
train_df, test_df = train_test_split(
|
||||
df,
|
||||
test_size=config.TEST_SIZE,
|
||||
random_state=config.RANDOM_STATE,
|
||||
stratify=target_bins,
|
||||
)
|
||||
|
||||
train_df = train_df.reset_index(drop=True)
|
||||
test_df = test_df.reset_index(drop=True)
|
||||
|
||||
self.outlier_bounds = fit_outlier_bounds(train_df, NUMERICAL_OUTLIER_COLUMNS)
|
||||
train_df = apply_outlier_bounds(train_df, self.outlier_bounds)
|
||||
test_df = apply_outlier_bounds(test_df, self.outlier_bounds)
|
||||
|
||||
train_df = engineer_features(train_df)
|
||||
test_df = engineer_features(test_df)
|
||||
X_train_df, y_train = extract_xy(train_df)
|
||||
X_test_df, y_test = extract_xy(test_df)
|
||||
|
||||
X_train_df, self.label_encoders = fit_label_encoders(X_train_df, ORDINAL_COLUMNS)
|
||||
X_test_df = apply_label_encoders(X_test_df, self.label_encoders)
|
||||
|
||||
self.feature_names = list(X_train_df.columns)
|
||||
X_test_df = align_feature_frame(X_test_df, self.feature_names)
|
||||
X_train = self.scaler.fit_transform(to_float_array(X_train_df))
|
||||
X_test = self.scaler.transform(to_float_array(X_test_df))
|
||||
|
||||
transformed_target = self.transform_target(y_train)
|
||||
X_train = self.select_features(X_train, transformed_target, k=self.feature_k)
|
||||
X_test = self.transform_features(X_test_df)
|
||||
|
||||
self.training_metadata = {
|
||||
'train_samples': int(len(train_df)),
|
||||
'test_samples': int(len(test_df)),
|
||||
'feature_count_before_selection': int(len(self.feature_names)),
|
||||
'feature_count_after_selection': int(X_train.shape[1]),
|
||||
'training_date': datetime.now().strftime('%Y-%m-%d'),
|
||||
'target_transform': self.target_transform,
|
||||
'available_models': list(self.enabled_models),
|
||||
}
|
||||
return X_train, X_test, y_train, y_test
|
||||
|
||||
|
||||
def _run_search(self, name, estimator, params, X_train, y_train, n_iter=12):
|
||||
start_time = time.time()
|
||||
search = RandomizedSearchCV(
|
||||
estimator,
|
||||
param_distributions=params,
|
||||
n_iter=n_iter,
|
||||
cv=4,
|
||||
scoring='r2',
|
||||
n_jobs=-1,
|
||||
random_state=config.RANDOM_STATE,
|
||||
)
|
||||
search.fit(X_train, y_train)
|
||||
self.models[name] = search.best_estimator_
|
||||
print_training_log(name, start_time, search.best_score_, search.best_params_, n_iter, 4)
|
||||
|
||||
def train_random_forest(self, X_train, y_train):
|
||||
print("\n" + "="*60)
|
||||
print("Training Random Forest")
|
||||
print("="*60)
|
||||
|
||||
start_time = time.time()
|
||||
rf = RandomForestRegressor(random_state=42, n_jobs=-1)
|
||||
|
||||
param_distributions = {
|
||||
'n_estimators': [200, 300, 400],
|
||||
'max_depth': [10, 15, 20, 25],
|
||||
'min_samples_split': [2, 5, 10],
|
||||
'min_samples_leaf': [1, 2, 4],
|
||||
'max_features': ['sqrt', 0.7]
|
||||
}
|
||||
|
||||
print(f" Searching {20*5} parameter combinations...")
|
||||
random_search = RandomizedSearchCV(
|
||||
rf, param_distributions, n_iter=20, cv=5,
|
||||
scoring='r2', n_jobs=-1, random_state=42
|
||||
self._run_search(
|
||||
'random_forest',
|
||||
RandomForestRegressor(random_state=config.RANDOM_STATE, n_jobs=-1),
|
||||
{
|
||||
'n_estimators': [200, 300, 400],
|
||||
'max_depth': [10, 14, 18, None],
|
||||
'min_samples_split': [2, 4, 8],
|
||||
'min_samples_leaf': [1, 2, 3],
|
||||
'max_features': ['sqrt', 0.7],
|
||||
},
|
||||
X_train,
|
||||
y_train,
|
||||
)
|
||||
random_search.fit(X_train, y_train)
|
||||
|
||||
self.models['random_forest'] = random_search.best_estimator_
|
||||
print_training_log("Random Forest", start_time, random_search.best_score_,
|
||||
random_search.best_params_, 20, 5)
|
||||
|
||||
return random_search.best_estimator_
|
||||
|
||||
def train_xgboost(self, X_train, y_train):
|
||||
print("\n" + "="*60)
|
||||
print("Training XGBoost")
|
||||
print("="*60)
|
||||
|
||||
start_time = time.time()
|
||||
xgb_model = xgb.XGBRegressor(random_state=42, n_jobs=-1)
|
||||
|
||||
param_distributions = {
|
||||
'n_estimators': [200, 300, 400],
|
||||
'max_depth': [5, 7, 9],
|
||||
'learning_rate': [0.05, 0.1],
|
||||
'subsample': [0.7, 0.8],
|
||||
'colsample_bytree': [0.7, 0.8],
|
||||
'min_child_weight': [1, 3],
|
||||
'reg_alpha': [0, 0.1],
|
||||
'reg_lambda': [1, 1.5]
|
||||
}
|
||||
|
||||
print(f" Searching {20*5} parameter combinations...")
|
||||
random_search = RandomizedSearchCV(
|
||||
xgb_model, param_distributions, n_iter=20, cv=5,
|
||||
scoring='r2', n_jobs=-1, random_state=42
|
||||
)
|
||||
random_search.fit(X_train, y_train)
|
||||
|
||||
self.models['xgboost'] = random_search.best_estimator_
|
||||
print_training_log("XGBoost", start_time, random_search.best_score_,
|
||||
random_search.best_params_, 20, 5)
|
||||
|
||||
return random_search.best_estimator_
|
||||
|
||||
def train_lightgbm(self, X_train, y_train):
|
||||
print("\n" + "="*60)
|
||||
print("Training LightGBM")
|
||||
print("="*60)
|
||||
|
||||
start_time = time.time()
|
||||
lgb_model = lgb.LGBMRegressor(random_state=42, n_jobs=-1, verbose=-1)
|
||||
|
||||
param_distributions = {
|
||||
'n_estimators': [200, 300, 400],
|
||||
'max_depth': [7, 9, 11, -1],
|
||||
'learning_rate': [0.05, 0.1],
|
||||
'subsample': [0.7, 0.8],
|
||||
'colsample_bytree': [0.7, 0.8],
|
||||
'min_child_samples': [5, 10, 20],
|
||||
'reg_alpha': [0, 0.1],
|
||||
'reg_lambda': [1, 1.5],
|
||||
'num_leaves': [31, 50, 70]
|
||||
}
|
||||
|
||||
print(f" Searching {20*5} parameter combinations...")
|
||||
random_search = RandomizedSearchCV(
|
||||
lgb_model, param_distributions, n_iter=20, cv=5,
|
||||
scoring='r2', n_jobs=-1, random_state=42
|
||||
)
|
||||
random_search.fit(X_train, y_train)
|
||||
|
||||
self.models['lightgbm'] = random_search.best_estimator_
|
||||
print_training_log("LightGBM", start_time, random_search.best_score_,
|
||||
random_search.best_params_, 20, 5)
|
||||
|
||||
return random_search.best_estimator_
|
||||
|
||||
|
||||
def train_gradient_boosting(self, X_train, y_train):
|
||||
print("\n" + "="*60)
|
||||
print("Training Gradient Boosting")
|
||||
print("="*60)
|
||||
|
||||
start_time = time.time()
|
||||
gb = GradientBoostingRegressor(random_state=42)
|
||||
|
||||
param_distributions = {
|
||||
'n_estimators': [200, 300],
|
||||
'max_depth': [5, 7, 9],
|
||||
'learning_rate': [0.05, 0.1],
|
||||
'subsample': [0.7, 0.8],
|
||||
'min_samples_split': [2, 5],
|
||||
'min_samples_leaf': [1, 2]
|
||||
}
|
||||
|
||||
print(f" Searching {15*5} parameter combinations...")
|
||||
random_search = RandomizedSearchCV(
|
||||
gb, param_distributions, n_iter=15, cv=5,
|
||||
scoring='r2', n_jobs=-1, random_state=42
|
||||
self._run_search(
|
||||
'gradient_boosting',
|
||||
GradientBoostingRegressor(random_state=config.RANDOM_STATE),
|
||||
{
|
||||
'n_estimators': [160, 220, 300],
|
||||
'max_depth': [3, 4, 5],
|
||||
'learning_rate': [0.03, 0.05, 0.08],
|
||||
'subsample': [0.7, 0.85, 1.0],
|
||||
'min_samples_split': [2, 4, 6],
|
||||
'min_samples_leaf': [1, 2, 3],
|
||||
},
|
||||
X_train,
|
||||
y_train,
|
||||
)
|
||||
random_search.fit(X_train, y_train)
|
||||
|
||||
self.models['gradient_boosting'] = random_search.best_estimator_
|
||||
print_training_log("Gradient Boosting", start_time, random_search.best_score_,
|
||||
random_search.best_params_, 15, 5)
|
||||
|
||||
return random_search.best_estimator_
|
||||
|
||||
|
||||
def train_extra_trees(self, X_train, y_train):
|
||||
print("\n" + "="*60)
|
||||
print("Training Extra Trees")
|
||||
print("="*60)
|
||||
|
||||
start_time = time.time()
|
||||
et = ExtraTreesRegressor(random_state=42, n_jobs=-1)
|
||||
|
||||
param_distributions = {
|
||||
'n_estimators': [200, 300, 400],
|
||||
'max_depth': [10, 15, 20],
|
||||
'min_samples_split': [2, 5, 10],
|
||||
'min_samples_leaf': [1, 2, 4],
|
||||
'max_features': ['sqrt', 0.7]
|
||||
}
|
||||
|
||||
print(f" Searching {20*5} parameter combinations...")
|
||||
random_search = RandomizedSearchCV(
|
||||
et, param_distributions, n_iter=20, cv=5,
|
||||
scoring='r2', n_jobs=-1, random_state=42
|
||||
self._run_search(
|
||||
'extra_trees',
|
||||
ExtraTreesRegressor(random_state=config.RANDOM_STATE, n_jobs=-1),
|
||||
{
|
||||
'n_estimators': [220, 320, 420],
|
||||
'max_depth': [10, 15, 20, None],
|
||||
'min_samples_split': [2, 4, 8],
|
||||
'min_samples_leaf': [1, 2, 3],
|
||||
'max_features': ['sqrt', 0.7],
|
||||
},
|
||||
X_train,
|
||||
y_train,
|
||||
)
|
||||
random_search.fit(X_train, y_train)
|
||||
|
||||
self.models['extra_trees'] = random_search.best_estimator_
|
||||
print_training_log("Extra Trees", start_time, random_search.best_score_,
|
||||
random_search.best_params_, 20, 5)
|
||||
|
||||
return random_search.best_estimator_
|
||||
|
||||
def train_stacking(self, X_train, y_train):
|
||||
print("\n" + "="*60)
|
||||
print("Training Stacking Ensemble")
|
||||
print("="*60)
|
||||
|
||||
start_time = time.time()
|
||||
base_estimators = []
|
||||
|
||||
if 'random_forest' in self.models:
|
||||
base_estimators.append(('rf', self.models['random_forest']))
|
||||
if 'xgboost' in self.models:
|
||||
base_estimators.append(('xgb', self.models['xgboost']))
|
||||
if 'lightgbm' in self.models:
|
||||
base_estimators.append(('lgb', self.models['lightgbm']))
|
||||
if 'gradient_boosting' in self.models:
|
||||
base_estimators.append(('gb', self.models['gradient_boosting']))
|
||||
|
||||
if len(base_estimators) < 2:
|
||||
print(" Not enough base models for stacking")
|
||||
return None
|
||||
|
||||
print(f" Base estimators: {[name for name, _ in base_estimators]}")
|
||||
print(f" Meta learner: Ridge")
|
||||
print(f" CV folds: 5")
|
||||
|
||||
stacking = StackingRegressor(
|
||||
estimators=base_estimators,
|
||||
final_estimator=Ridge(alpha=1.0),
|
||||
cv=5,
|
||||
n_jobs=-1
|
||||
|
||||
def train_lightgbm(self, X_train, y_train):
|
||||
if lgb is None:
|
||||
return
|
||||
self._run_search(
|
||||
'lightgbm',
|
||||
lgb.LGBMRegressor(random_state=config.RANDOM_STATE, n_jobs=-1, verbose=-1),
|
||||
{
|
||||
'n_estimators': [180, 260, 340],
|
||||
'max_depth': [7, 9, -1],
|
||||
'learning_rate': [0.03, 0.05, 0.08],
|
||||
'subsample': [0.7, 0.85, 1.0],
|
||||
'colsample_bytree': [0.7, 0.85, 1.0],
|
||||
'num_leaves': [31, 50, 70],
|
||||
},
|
||||
X_train,
|
||||
y_train,
|
||||
)
|
||||
stacking.fit(X_train, y_train)
|
||||
|
||||
self.models['stacking'] = stacking
|
||||
elapsed = time.time() - start_time
|
||||
print(f" {'─'*50}")
|
||||
print(f" Stacking ensemble created in {elapsed:.1f}s")
|
||||
print(f" {'─'*50}")
|
||||
|
||||
return stacking
|
||||
|
||||
|
||||
def train_xgboost(self, X_train, y_train):
|
||||
if xgb is None:
|
||||
return
|
||||
self._run_search(
|
||||
'xgboost',
|
||||
xgb.XGBRegressor(random_state=config.RANDOM_STATE, n_jobs=-1),
|
||||
{
|
||||
'n_estimators': [180, 260, 340],
|
||||
'max_depth': [4, 6, 8],
|
||||
'learning_rate': [0.03, 0.05, 0.08],
|
||||
'subsample': [0.7, 0.85, 1.0],
|
||||
'colsample_bytree': [0.7, 0.85, 1.0],
|
||||
'min_child_weight': [1, 3, 5],
|
||||
},
|
||||
X_train,
|
||||
y_train,
|
||||
)
|
||||
|
||||
def evaluate_model(self, model, X_test, y_test):
|
||||
y_pred = model.predict(X_test)
|
||||
|
||||
r2 = r2_score(y_test, y_pred)
|
||||
y_pred = self.inverse_transform_target(model.predict(X_test))
|
||||
y_pred = np.clip(y_pred, a_min=0, a_max=None)
|
||||
mse = mean_squared_error(y_test, y_pred)
|
||||
rmse = np.sqrt(mse)
|
||||
mae = mean_absolute_error(y_test, y_pred)
|
||||
|
||||
return {
|
||||
'r2': round(r2, 4),
|
||||
'r2': round(r2_score(y_test, y_pred), 4),
|
||||
'mse': round(mse, 4),
|
||||
'rmse': round(rmse, 4),
|
||||
'mae': round(mae, 4)
|
||||
'rmse': round(np.sqrt(mse), 4),
|
||||
'mae': round(mean_absolute_error(y_test, y_pred), 4),
|
||||
}
|
||||
|
||||
|
||||
def save_models(self):
|
||||
os.makedirs(config.MODELS_DIR, exist_ok=True)
|
||||
|
||||
for name, model in self.models.items():
|
||||
if model is not None:
|
||||
model_path = os.path.join(config.MODELS_DIR, f'{name}_model.pkl')
|
||||
joblib.dump(model, model_path)
|
||||
print(f" {name} saved")
|
||||
|
||||
joblib.dump(model, os.path.join(config.MODELS_DIR, f'{name}_model.pkl'))
|
||||
joblib.dump(self.scaler, config.SCALER_PATH)
|
||||
joblib.dump(self.feature_names, os.path.join(config.MODELS_DIR, 'feature_names.pkl'))
|
||||
joblib.dump(self.selected_features, os.path.join(config.MODELS_DIR, 'selected_features.pkl'))
|
||||
joblib.dump(self.label_encoders, os.path.join(config.MODELS_DIR, 'label_encoders.pkl'))
|
||||
joblib.dump(self.model_metrics, os.path.join(config.MODELS_DIR, 'model_metrics.pkl'))
|
||||
print(" Scaler and feature info saved")
|
||||
|
||||
joblib.dump(self.training_metadata, os.path.join(config.MODELS_DIR, 'training_metadata.pkl'))
|
||||
|
||||
def train_all(self):
|
||||
total_start = time.time()
|
||||
print("\n" + "="*60)
|
||||
print("Optimized Model Training Started")
|
||||
print("="*60)
|
||||
print(f"Start time: {time.strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
|
||||
print('\nOptimized Model Training Started')
|
||||
X_train, X_test, y_train, y_test = self.prepare_data()
|
||||
|
||||
print(f"\nTrain size: {len(X_train)}, Test size: {len(X_test)}")
|
||||
|
||||
print("\n" + "="*60)
|
||||
print("Training Models with Hyperparameter Optimization")
|
||||
print("="*60)
|
||||
|
||||
self.train_random_forest(X_train, y_train)
|
||||
self.train_extra_trees(X_train, y_train)
|
||||
self.train_xgboost(X_train, y_train)
|
||||
self.train_lightgbm(X_train, y_train)
|
||||
self.train_gradient_boosting(X_train, y_train)
|
||||
self.train_stacking(X_train, y_train)
|
||||
|
||||
print("\n" + "="*60)
|
||||
print("Evaluating Models on Test Set")
|
||||
print("="*60)
|
||||
|
||||
best_r2 = -float('inf')
|
||||
best_model = None
|
||||
|
||||
y_train_transformed = self.transform_target(y_train)
|
||||
|
||||
if 'random_forest' in self.enabled_models:
|
||||
self.train_random_forest(X_train, y_train_transformed)
|
||||
if 'gradient_boosting' in self.enabled_models:
|
||||
self.train_gradient_boosting(X_train, y_train_transformed)
|
||||
if 'extra_trees' in self.enabled_models:
|
||||
self.train_extra_trees(X_train, y_train_transformed)
|
||||
if 'lightgbm' in self.enabled_models:
|
||||
self.train_lightgbm(X_train, y_train_transformed)
|
||||
if 'xgboost' in self.enabled_models:
|
||||
self.train_xgboost(X_train, y_train_transformed)
|
||||
|
||||
for name, model in self.models.items():
|
||||
if model is not None:
|
||||
metrics = self.evaluate_model(model, X_test, y_test)
|
||||
self.model_metrics[name] = metrics
|
||||
|
||||
status = "Good" if metrics['r2'] > 0.5 else ("OK" if metrics['r2'] > 0.3 else "Poor")
|
||||
status_icon = "✓" if status == "Good" else ("△" if status == "OK" else "✗")
|
||||
print(f" {status_icon} {name:20s} - R2: {metrics['r2']:.4f}, RMSE: {metrics['rmse']:.4f}, MAE: {metrics['mae']:.4f}")
|
||||
|
||||
if metrics['r2'] > best_r2:
|
||||
best_r2 = metrics['r2']
|
||||
best_model = name
|
||||
|
||||
print(f"\n ★ Best Model: {best_model} (R2 = {best_r2:.4f})")
|
||||
|
||||
print("\n" + "="*60)
|
||||
print("Saving Models")
|
||||
print("="*60)
|
||||
metrics = self.evaluate_model(model, X_test, y_test)
|
||||
self.model_metrics[name] = metrics
|
||||
print(f' {name:20s} R2={metrics["r2"]:.4f} RMSE={metrics["rmse"]:.4f} MAE={metrics["mae"]:.4f}')
|
||||
|
||||
self.save_models()
|
||||
|
||||
return self.model_metrics
|
||||
|
||||
|
||||
def train_and_save_models():
|
||||
total_start = time.time()
|
||||
start = time.time()
|
||||
trainer = OptimizedModelTrainer()
|
||||
metrics = trainer.train_all()
|
||||
total_elapsed = time.time() - total_start
|
||||
|
||||
print("\n" + "="*60)
|
||||
print("Training Complete!")
|
||||
print("="*60)
|
||||
print(f"Total training time: {total_elapsed:.1f}s ({total_elapsed/60:.1f} min)")
|
||||
print(f"End time: {time.strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
|
||||
print("\n" + "-"*60)
|
||||
print("Final Model Ranking (by R2)")
|
||||
print("-"*60)
|
||||
|
||||
sorted_metrics = sorted(metrics.items(), key=lambda x: x[1]['r2'], reverse=True)
|
||||
for i, (name, m) in enumerate(sorted_metrics, 1):
|
||||
medal = "🥇" if i == 1 else ("🥈" if i == 2 else ("🥉" if i == 3 else " "))
|
||||
print(f" {medal} {i}. {name:20s} - R2: {m['r2']:.4f}, RMSE: {m['rmse']:.4f}")
|
||||
|
||||
print(f'\nTraining Complete in {time.time() - start:.1f}s')
|
||||
for idx, (name, metric) in enumerate(sorted(metrics.items(), key=lambda item: item[1]['r2'], reverse=True), start=1):
|
||||
print(f'{idx}. {name} - R2={metric["r2"]:.4f}')
|
||||
return metrics
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user