"""Train, evaluate and persist the absenteeism regression models.

Pipeline: load cleaned data -> drop leaky columns -> clip outliers ->
hold-out split -> augment the TRAINING rows only -> feature engineering ->
label encoding -> scaling -> univariate feature selection -> randomized
hyper-parameter search for several tree ensembles -> stacking -> evaluation
on the untouched hold-out rows -> save artefacts with joblib.
"""
import os
import sys

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import time
import warnings

import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    ExtraTreesRegressor,
    StackingRegressor,
)
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import RobustScaler, LabelEncoder

warnings.filterwarnings('ignore')

import config
from core.preprocessing import get_clean_data

TARGET_COL = 'Absenteeism time in hours'
RANDOM_STATE = 42

# Columns that are clipped for outliers and that may receive Gaussian noise
# during augmentation. Binary flags and category codes are deliberately NOT
# listed: perturbing them breaks the equality / membership tests in
# feature_engineering() and makes every noisy value a distinct label.
NUMERICAL_COLS = [
    'Age', 'Service time', 'Work load Average/day', 'Transportation expense',
    'Distance from Residence to Work', 'Hit target', 'Body mass index',
]

# Columns passed through LabelEncoder (on their string representation).
ORDINAL_COLS = [
    'Month of absence', 'Day of the week', 'Seasons', 'Disciplinary failure',
    'Education', 'Social drinker', 'Social smoker', 'age_group',
    'service_group', 'bmi_category', 'workload_category', 'commute_category',
    'experience_level',
]

# (alias inside the stack, key in OptimizedModelTrainer.models)
STACKING_BASES = (
    ('rf', 'random_forest'),
    ('xgb', 'xgboost'),
    ('lgb', 'lightgbm'),
    ('gb', 'gradient_boosting'),
)


def print_training_log(model_name, start_time, best_score, best_params, n_iter, cv_folds):
    """Print a summary of one finished hyper-parameter search.

    Args:
        model_name: Display name of the model.
        start_time: ``time.time()`` value taken when training started.
        best_score: Best mean cross-validated R2 found by the search.
        best_params: Mapping of the winning hyper-parameters.
        n_iter: Number of parameter settings that were sampled.
        cv_folds: Number of cross-validation folds.
    """
    elapsed = time.time() - start_time
    print(f" {'─'*50}")
    print(f" Model: {model_name}")
    print(f" Time: {elapsed:.1f}s")
    print(f" Best CV R2: {best_score:.4f}")
    print(f" Best params:")
    for k, v in best_params.items():
        print(f" - {k}: {v}")
    print(f" Iterations: {n_iter}, CV folds: {cv_folds}")
    print(f" {'─'*50}")


class DataAugmenter:
    """Noise-based augmentation helpers for tabular regression data."""

    def __init__(self, noise_level=0.02, n_augment=2, random_state=None):
        """
        Args:
            noise_level: Noise standard deviation as a fraction of each
                column's own standard deviation.
            n_augment: Number of noisy copies appended by :meth:`augment`.
            random_state: Optional seed for a private NumPy generator.
                ``None`` (the default) keeps the augmentation unseeded.
        """
        self.noise_level = noise_level
        self.n_augment = n_augment
        self.rng = np.random.default_rng(random_state)

    @staticmethod
    def _noise_columns(df, target_col, columns):
        """Return the numeric, non-target columns that may be perturbed."""
        numeric = df.select_dtypes(include=[np.number]).columns.tolist()
        if columns is not None:
            allowed = set(columns)
            numeric = [c for c in numeric if c in allowed]
        return [c for c in numeric if c != target_col]

    def augment(self, df, target_col=TARGET_COL, columns=None):
        """Append ``n_augment`` noisy copies of ``df`` to itself.

        Args:
            df: Input frame; it is not modified.
            target_col: Column that is never perturbed.
            columns: Optional iterable restricting which numeric columns get
                noise. ``None`` perturbs every numeric column except the
                target.

        Returns:
            A new frame with ``len(df) * (n_augment + 1)`` rows and a fresh
            integer index.
        """
        print(f"\nData Augmentation...")
        print(f" Original size: {len(df)}")

        noise_cols = self._noise_columns(df, target_col, columns)
        # The spread is measured once on the unperturbed data.
        stds = {col: df[col].std() for col in noise_cols}

        frames = [df]
        for _ in range(self.n_augment):
            noisy = df.copy()
            for col, std_val in stds.items():
                # NaN std (single row) compares False and is skipped too.
                if std_val > 0:
                    noisy[col] = noisy[col] + self.rng.normal(
                        0, self.noise_level * std_val, len(noisy)
                    )
            frames.append(noisy)

        df_result = pd.concat(frames, ignore_index=True)
        print(f" Augmented size: {len(df_result)}")
        return df_result

    def smote_regression(self, df, target_col=TARGET_COL, columns=None):
        """Oversample under-represented target ranges with jittered copies.

        The target is bucketed into zero / low / medium / high ranges and
        every non-empty bucket is topped up to the size of the largest one
        by resampling its rows with replacement and adding Gaussian noise.

        Args:
            df: Input frame; it is not modified.
            target_col: Name of the regression target.
            columns: Optional iterable restricting which numeric columns get
                noise. ``None`` perturbs every numeric column except the
                target.

        Returns:
            A new frame containing the original rows followed by the
            synthetic ones.
        """
        # The open-ended top edge keeps very large targets in the 'high'
        # bucket instead of leaving them unbinned (NaN).
        bins = [0, 1, 4, 8, np.inf]
        labels = ['zero', 'low', 'medium', 'high']
        target_bins = pd.cut(df[target_col], bins=bins, labels=labels, include_lowest=True)
        max_count = target_bins.value_counts().max()

        noise_cols = self._noise_columns(df, target_col, columns)

        synthetic_frames = []
        for bin_label in labels:
            bin_df = df[target_bins == bin_label]
            bin_size = len(bin_df)
            if not 0 < bin_size < max_count:
                continue

            n_missing = int(max_count - bin_size)
            picks = self.rng.integers(0, bin_size, size=n_missing)
            synthetic = bin_df.iloc[picks].copy()
            for col in noise_cols:
                std_val = bin_df[col].std()
                if std_val > 0:
                    synthetic[col] = synthetic[col] + self.rng.normal(
                        0, self.noise_level * std_val, n_missing
                    )
            synthetic_frames.append(synthetic)

        if synthetic_frames:
            df_result = pd.concat([df, *synthetic_frames], ignore_index=True)
        else:
            df_result = df.copy()

        print(f" After SMOTE-like augmentation: {len(df_result)}")
        return df_result


class OptimizedModelTrainer:
    """Prepare the data, tune several regressors, stack, evaluate and save."""

    def __init__(self):
        self.models = {}
        self.scaler = RobustScaler()
        self.selector = None  # fitted SelectKBest, set by select_features()
        self.feature_names = None
        self.selected_features = None
        self.label_encoders = {}
        self.model_metrics = {}
        self.augmenter = DataAugmenter(
            noise_level=0.02, n_augment=2, random_state=RANDOM_STATE
        )

    def analyze_data(self, df):
        """Print summary statistics of the target column and return it."""
        print("\n" + "="*60)
        print("Data Analysis")
        print("="*60)

        y = df[TARGET_COL]

        print(f"\nTarget variable statistics:")
        print(f" Min: {y.min()}")
        print(f" Max: {y.max()}")
        print(f" Mean: {y.mean():.2f}")
        print(f" Median: {y.median():.2f}")
        print(f" Std: {y.std():.2f}")
        print(f" Skewness: {y.skew():.2f}")

        n_zero = (y == 0).sum()
        n_short = ((y > 0) & (y <= 8)).sum()
        n_long = (y > 8).sum()

        print(f"\nTarget distribution:")
        print(f" Zero values: {n_zero} ({n_zero / len(y) * 100:.1f}%)")
        print(f" 1-8 hours: {n_short} ({n_short / len(y) * 100:.1f}%)")
        print(f" >8 hours: {n_long} ({n_long / len(y) * 100:.1f}%)")

        return y

    def clip_outliers(self, df, columns, lower_pct=1, upper_pct=99):
        """Clip numeric ``columns`` to their [lower_pct, upper_pct] percentiles.

        The target column is never clipped; missing or non-numeric columns
        are skipped. Returns a clipped copy of ``df``.
        """
        df_clean = df.copy()
        for col in columns:
            if col == TARGET_COL or col not in df_clean.columns:
                continue
            series = df_clean[col]
            if pd.api.types.is_bool_dtype(series) or not pd.api.types.is_numeric_dtype(series):
                continue
            lower = series.quantile(lower_pct / 100)
            upper = series.quantile(upper_pct / 100)
            df_clean[col] = series.clip(lower, upper)
        return df_clean

    def feature_engineering(self, df):
        """Return a copy of ``df`` with ratio, flag, bucket and interaction features."""
        df = df.copy()

        # Ratios (+1 in the denominator avoids division by zero).
        df['workload_per_age'] = df['Work load Average/day'] / (df['Age'] + 1)
        df['expense_per_distance'] = df['Transportation expense'] / (df['Distance from Residence to Work'] + 1)
        df['age_service_ratio'] = df['Age'] / (df['Service time'] + 1)

        # Family and lifestyle flags.
        df['has_children'] = (df['Son'] > 0).astype(int)
        df['has_pet'] = (df['Pet'] > 0).astype(int)
        df['family_responsibility'] = df['Son'] + df['Pet']
        df['health_risk'] = (
            (df['Social drinker'] == 1)
            | (df['Social smoker'] == 1)
            | (df['Body mass index'] > 30)
        ).astype(int)
        df['lifestyle_risk'] = df['Social drinker'].astype(int) + df['Social smoker'].astype(int)

        # Buckets; values outside the bin edges become NaN.
        df['age_group'] = pd.cut(df['Age'], bins=[0, 30, 40, 50, 100], labels=[1, 2, 3, 4])
        df['service_group'] = pd.cut(df['Service time'], bins=[0, 5, 10, 20, 100], labels=[1, 2, 3, 4])
        df['bmi_category'] = pd.cut(df['Body mass index'], bins=[0, 18.5, 25, 30, 100], labels=[1, 2, 3, 4])
        df['workload_category'] = pd.cut(df['Work load Average/day'], bins=[0, 200, 250, 300, 500], labels=[1, 2, 3, 4])
        df['commute_category'] = pd.cut(df['Distance from Residence to Work'], bins=[0, 10, 20, 50, 100], labels=[1, 2, 3, 4])

        df['seasonal_risk'] = df['Seasons'].isin([1, 3]).astype(int)
        df['weekday_risk'] = df['Day of the week'].isin([2, 6]).astype(int)

        df['hit_target_ratio'] = df['Hit target'] / 100
        df['experience_level'] = pd.cut(df['Service time'], bins=[0, 5, 10, 15, 100], labels=[1, 2, 3, 4])

        # Interactions, rescaled by a constant.
        df['age_workload_interaction'] = df['Age'] * df['Work load Average/day'] / 10000
        df['service_bmi_interaction'] = df['Service time'] * df['Body mass index'] / 100

        return df

    def select_features(self, X, y, k=20):
        """Fit SelectKBest (f_regression) on ``X``/``y`` and return the reduced ``X``.

        Side effects: stores the fitted selector in ``self.selector`` and the
        chosen feature names in ``self.selected_features``. Requires
        ``self.feature_names`` to describe the columns of ``X``.
        """
        print("\nFeature Selection...")

        selector = SelectKBest(score_func=f_regression, k=min(k, X.shape[1]))
        selector.fit(X, y)

        ranked = sorted(
            zip(self.feature_names, selector.scores_),
            key=lambda pair: pair[1],
            reverse=True,
        )
        top_n = min(k, len(ranked))

        print(f"\nTop {top_n} features by F-score:")
        for rank, (name, score) in enumerate(ranked[:top_n], 1):
            cn = config.FEATURE_NAME_CN.get(name, name)
            print(f" {rank}. {cn}: {score:.2f}")

        mask = selector.get_support()
        self.selected_features = [f for f, keep in zip(self.feature_names, mask) if keep]
        self.selector = selector

        return selector.transform(X)

    def prepare_data(self):
        """Load the data and build the train / test matrices.

        The hold-out split is made BEFORE augmentation, so the test set
        holds only original rows and no noisy copies of training rows. The
        scaler and the feature selector are fitted on the training rows
        only and then applied to the test rows.

        Returns:
            ``(X_train, X_test, y_train, y_test)`` as NumPy arrays.
        """
        df = get_clean_data()
        df.columns = [col.strip() for col in df.columns]
        df = df.drop(columns=['ID', 'Weight', 'Height', 'Reason for absence'], errors='ignore')
        print(" Removed features: Weight, Height, Reason for absence (data leakage risk)")

        self.analyze_data(df)

        print("\n" + "="*60)
        print("Data Preprocessing")
        print("="*60)

        df = self.clip_outliers(df, NUMERICAL_COLS)
        print(" Outliers clipped (1st-99th percentile)")

        train_df, test_df = train_test_split(df, test_size=0.2, random_state=RANDOM_STATE)
        print(f" Hold-out split: {len(train_df)} train rows, {len(test_df)} test rows (before augmentation)")

        print("\n" + "="*60)
        print("Data Augmentation")
        print("="*60)

        # Training rows only; noise is limited to the continuous columns.
        train_df = self.augmenter.smote_regression(train_df, columns=NUMERICAL_COLS)
        train_df = self.augmenter.augment(train_df, columns=NUMERICAL_COLS)

        print("\n" + "="*60)
        print("Feature Engineering")
        print("="*60)

        train_df = self.feature_engineering(train_df)
        test_df = self.feature_engineering(test_df)

        y_train = train_df[TARGET_COL].values
        y_test = test_df[TARGET_COL].values
        X_train_df = train_df.drop(columns=[TARGET_COL])
        X_test_df = test_df.drop(columns=[TARGET_COL])

        for col in ORDINAL_COLS:
            if col not in X_train_df.columns:
                continue
            train_labels = X_train_df[col].astype(str)
            test_labels = X_test_df[col].astype(str)
            le = LabelEncoder()
            # The label vocabulary comes from both splits so a category that
            # only occurs in the hold-out rows cannot raise at transform time.
            le.fit(pd.concat([train_labels, test_labels]))
            X_train_df[col] = le.transform(train_labels)
            X_test_df[col] = le.transform(test_labels)
            self.label_encoders[col] = le

        self.feature_names = list(X_train_df.columns)

        X_train = self.scaler.fit_transform(X_train_df.values.astype(float))
        X_test = self.scaler.transform(X_test_df[self.feature_names].values.astype(float))

        X_train = self.select_features(X_train, y_train, k=20)
        X_test = self.selector.transform(X_test)

        print(f"\nFinal feature count: {X_train.shape[1]}")

        return X_train, X_test, y_train, y_test

    def _run_random_search(self, key, display_name, estimator, param_distributions,
                           X_train, y_train, n_iter, cv_folds=5):
        """Tune ``estimator`` with RandomizedSearchCV and register the winner.

        The best estimator is stored in ``self.models[key]`` and returned.
        """
        print("\n" + "="*60)
        print(f"Training {display_name}")
        print("="*60)

        start_time = time.time()

        print(f" Sampling {n_iter} parameter settings x {cv_folds} CV folds = {n_iter * cv_folds} fits...")

        random_search = RandomizedSearchCV(
            estimator, param_distributions,
            n_iter=n_iter, cv=cv_folds, scoring='r2',
            n_jobs=-1, random_state=RANDOM_STATE
        )
        random_search.fit(X_train, y_train)

        self.models[key] = random_search.best_estimator_
        print_training_log(display_name, start_time, random_search.best_score_,
                           random_search.best_params_, n_iter, cv_folds)

        return random_search.best_estimator_

    def train_random_forest(self, X_train, y_train):
        """Tune a RandomForestRegressor; stored under ``'random_forest'``."""
        param_distributions = {
            'n_estimators': [200, 300, 400],
            'max_depth': [10, 15, 20, 25],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': ['sqrt', 0.7]
        }
        return self._run_random_search(
            'random_forest', "Random Forest",
            RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=-1),
            param_distributions, X_train, y_train, n_iter=20,
        )

    def train_xgboost(self, X_train, y_train):
        """Tune an XGBRegressor; stored under ``'xgboost'``."""
        param_distributions = {
            'n_estimators': [200, 300, 400],
            'max_depth': [5, 7, 9],
            'learning_rate': [0.05, 0.1],
            'subsample': [0.7, 0.8],
            'colsample_bytree': [0.7, 0.8],
            'min_child_weight': [1, 3],
            'reg_alpha': [0, 0.1],
            'reg_lambda': [1, 1.5]
        }
        return self._run_random_search(
            'xgboost', "XGBoost",
            xgb.XGBRegressor(random_state=RANDOM_STATE, n_jobs=-1),
            param_distributions, X_train, y_train, n_iter=20,
        )

    def train_lightgbm(self, X_train, y_train):
        """Tune an LGBMRegressor; stored under ``'lightgbm'``."""
        param_distributions = {
            'n_estimators': [200, 300, 400],
            'max_depth': [7, 9, 11, -1],
            'learning_rate': [0.05, 0.1],
            'subsample': [0.7, 0.8],
            'colsample_bytree': [0.7, 0.8],
            'min_child_samples': [5, 10, 20],
            'reg_alpha': [0, 0.1],
            'reg_lambda': [1, 1.5],
            'num_leaves': [31, 50, 70]
        }
        return self._run_random_search(
            'lightgbm', "LightGBM",
            lgb.LGBMRegressor(random_state=RANDOM_STATE, n_jobs=-1, verbose=-1),
            param_distributions, X_train, y_train, n_iter=20,
        )

    def train_gradient_boosting(self, X_train, y_train):
        """Tune a GradientBoostingRegressor; stored under ``'gradient_boosting'``."""
        param_distributions = {
            'n_estimators': [200, 300],
            'max_depth': [5, 7, 9],
            'learning_rate': [0.05, 0.1],
            'subsample': [0.7, 0.8],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [1, 2]
        }
        return self._run_random_search(
            'gradient_boosting', "Gradient Boosting",
            GradientBoostingRegressor(random_state=RANDOM_STATE),
            param_distributions, X_train, y_train, n_iter=15,
        )

    def train_extra_trees(self, X_train, y_train):
        """Tune an ExtraTreesRegressor; stored under ``'extra_trees'``."""
        param_distributions = {
            'n_estimators': [200, 300, 400],
            'max_depth': [10, 15, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': ['sqrt', 0.7]
        }
        return self._run_random_search(
            'extra_trees', "Extra Trees",
            ExtraTreesRegressor(random_state=RANDOM_STATE, n_jobs=-1),
            param_distributions, X_train, y_train, n_iter=20,
        )

    def train_stacking(self, X_train, y_train):
        """Fit a Ridge-blended StackingRegressor over the tuned base models.

        Uses whichever of random forest, XGBoost, LightGBM and gradient
        boosting are already in ``self.models``. Returns ``None`` (and
        registers nothing) when fewer than two are available.
        """
        print("\n" + "="*60)
        print("Training Stacking Ensemble")
        print("="*60)

        start_time = time.time()

        base_estimators = [
            (alias, self.models[key])
            for alias, key in STACKING_BASES
            if key in self.models
        ]

        if len(base_estimators) < 2:
            print(" Not enough base models for stacking")
            return None

        print(f" Base estimators: {[name for name, _ in base_estimators]}")
        print(f" Meta learner: Ridge")
        print(f" CV folds: 5")

        stacking = StackingRegressor(
            estimators=base_estimators,
            final_estimator=Ridge(alpha=1.0),
            cv=5,
            n_jobs=-1
        )
        stacking.fit(X_train, y_train)

        self.models['stacking'] = stacking

        elapsed = time.time() - start_time
        print(f" {'─'*50}")
        print(f" Stacking ensemble created in {elapsed:.1f}s")
        print(f" {'─'*50}")

        return stacking

    def evaluate_model(self, model, X_test, y_test):
        """Return R2, MSE, RMSE and MAE of ``model`` on the test data, rounded to 4 places."""
        y_pred = model.predict(X_test)

        r2 = r2_score(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test, y_pred)

        return {
            'r2': round(r2, 4),
            'mse': round(mse, 4),
            'rmse': round(rmse, 4),
            'mae': round(mae, 4)
        }

    def save_models(self):
        """Dump every trained model and the preprocessing artefacts with joblib."""
        os.makedirs(config.MODELS_DIR, exist_ok=True)

        for name, model in self.models.items():
            if model is not None:
                model_path = os.path.join(config.MODELS_DIR, f'{name}_model.pkl')
                joblib.dump(model, model_path)
                print(f" {name} saved")

        joblib.dump(self.scaler, config.SCALER_PATH)
        joblib.dump(self.feature_names, os.path.join(config.MODELS_DIR, 'feature_names.pkl'))
        joblib.dump(self.selected_features, os.path.join(config.MODELS_DIR, 'selected_features.pkl'))
        joblib.dump(self.label_encoders, os.path.join(config.MODELS_DIR, 'label_encoders.pkl'))
        joblib.dump(self.model_metrics, os.path.join(config.MODELS_DIR, 'model_metrics.pkl'))
        print(" Scaler and feature info saved")

    def train_all(self):
        """Run the full pipeline and return ``{model_name: metrics}``."""
        print("\n" + "="*60)
        print("Optimized Model Training Started")
        print("="*60)
        print(f"Start time: {time.strftime('%Y-%m-%d %H:%M:%S')}")

        X_train, X_test, y_train, y_test = self.prepare_data()

        print(f"\nTrain size: {len(X_train)}, Test size: {len(X_test)}")

        print("\n" + "="*60)
        print("Training Models with Hyperparameter Optimization")
        print("="*60)

        self.train_random_forest(X_train, y_train)
        self.train_extra_trees(X_train, y_train)
        self.train_xgboost(X_train, y_train)
        self.train_lightgbm(X_train, y_train)
        self.train_gradient_boosting(X_train, y_train)
        self.train_stacking(X_train, y_train)

        print("\n" + "="*60)
        print("Evaluating Models on Test Set")
        print("="*60)

        best_r2 = -float('inf')
        best_model = None

        for name, model in self.models.items():
            if model is None:
                continue

            metrics = self.evaluate_model(model, X_test, y_test)
            self.model_metrics[name] = metrics

            if metrics['r2'] > 0.5:
                status_icon = "✓"
            elif metrics['r2'] > 0.3:
                status_icon = "△"
            else:
                status_icon = "✗"

            print(f" {status_icon} {name:20s} - R2: {metrics['r2']:.4f}, RMSE: {metrics['rmse']:.4f}, MAE: {metrics['mae']:.4f}")

            if metrics['r2'] > best_r2:
                best_r2 = metrics['r2']
                best_model = name

        print(f"\n ★ Best Model: {best_model} (R2 = {best_r2:.4f})")

        print("\n" + "="*60)
        print("Saving Models")
        print("="*60)

        self.save_models()

        return self.model_metrics


def train_and_save_models():
    """Train every model, print the final ranking and return the metrics dict."""
    total_start = time.time()

    trainer = OptimizedModelTrainer()
    metrics = trainer.train_all()

    total_elapsed = time.time() - total_start

    print("\n" + "="*60)
    print("Training Complete!")
    print("="*60)
    print(f"Total training time: {total_elapsed:.1f}s ({total_elapsed/60:.1f} min)")
    print(f"End time: {time.strftime('%Y-%m-%d %H:%M:%S')}")

    print("\n" + "-"*60)
    print("Final Model Ranking (by R2)")
    print("-"*60)

    medals = {1: "🥇", 2: "🥈", 3: "🥉"}
    sorted_metrics = sorted(metrics.items(), key=lambda x: x[1]['r2'], reverse=True)
    for i, (name, m) in enumerate(sorted_metrics, 1):
        medal = medals.get(i, " ")
        print(f" {medal} {i}. {name:20s} - R2: {m['r2']:.4f}, RMSE: {m['rmse']:.4f}")

    return metrics


if __name__ == '__main__':
    train_and_save_models()