import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import pandas as pd
import numpy as np
import time
from sklearn.ensemble import (
    RandomForestRegressor, 
    GradientBoostingRegressor,
    ExtraTreesRegressor,
    StackingRegressor
)
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import xgboost as xgb
import lightgbm as lgb
import joblib
import warnings
warnings.filterwarnings('ignore')

import config
from core.preprocessing import get_clean_data


def print_training_log(model_name, start_time, best_score, best_params, n_iter, cv_folds):
    """Print a framed summary of one finished hyperparameter search.

    Args:
        model_name: Display name of the model that was tuned.
        start_time: ``time.time()`` value captured before the search started;
            the elapsed time is measured against it.
        best_score: Best cross-validated R2 reported by the search.
        best_params: Mapping of parameter name to the winning value.
        n_iter: Number of sampled parameter settings.
        cv_folds: Number of cross-validation folds.
    """
    elapsed = time.time() - start_time
    rule = f"  {'─'*50}"

    # Assemble the whole report first, then emit it line by line.
    report = [
        rule,
        f"  Model: {model_name}",
        f"  Time: {elapsed:.1f}s",
        f"  Best CV R2: {best_score:.4f}",
        f"  Best params:",
    ]
    report.extend(f"    - {name}: {value}" for name, value in best_params.items())
    report.append(f"  Iterations: {n_iter}, CV folds: {cv_folds}")
    report.append(rule)

    for line in report:
        print(line)


class DataAugmenter:
    """Enlarge a tabular regression dataset with noisy copies of its rows.

    Two strategies are offered: ``augment`` appends whole jittered copies of
    the frame, and ``smote_regression`` oversamples under-represented target
    ranges. In both cases the target column is never perturbed.

    Args:
        noise_level: Standard deviation of the Gaussian noise, expressed as a
            fraction of each column's own standard deviation.
        n_augment: Number of noisy copies ``augment`` appends.
        random_state: Optional integer seed. When given, a private
            ``numpy.random.RandomState`` is used so output is reproducible;
            when ``None`` the global ``numpy.random`` stream is used.
    """

    # Target ranges used for balancing. The last edge is open-ended so that
    # targets above 100 still land in the 'high' bin instead of being unbinned.
    TARGET_BIN_EDGES = [0, 1, 4, 8, np.inf]
    TARGET_BIN_LABELS = ['zero', 'low', 'medium', 'high']

    def __init__(self, noise_level=0.02, n_augment=2, random_state=None):
        self.noise_level = noise_level
        self.n_augment = n_augment
        self.random_state = random_state
        # The numpy.random module exposes the same normal()/randint() calls as
        # a RandomState instance, so either can serve as the noise source.
        self._rng = np.random.RandomState(random_state) if random_state is not None else np.random

    @staticmethod
    def _noise_columns(df, target_col, noise_cols):
        """Return the numeric columns that may receive noise (never the target)."""
        numeric = df.select_dtypes(include=[np.number]).columns.tolist()
        if noise_cols is not None:
            requested = set(noise_cols)
            numeric = [col for col in numeric if col in requested]
        return [col for col in numeric if col != target_col]

    def augment(self, df, target_col='Absenteeism time in hours', noise_cols=None):
        """Return ``df`` followed by ``n_augment`` noisy copies of itself.

        Args:
            df: Source frame; it is not modified.
            target_col: Column that is left untouched in every copy.
            noise_cols: Optional iterable restricting which numeric columns
                are perturbed. ``None`` perturbs every numeric column.

        Returns:
            A new frame with ``(n_augment + 1) * len(df)`` rows and a fresh
            0..n-1 index.
        """
        print(f"\nData Augmentation...")
        print(f"  Original size: {len(df)}")

        columns = self._noise_columns(df, target_col, noise_cols)
        # Every copy starts from the same data, so the spreads are computed once.
        spreads = {col: df[col].std() for col in columns}

        copies = [df]
        for _ in range(self.n_augment):
            noisy = df.copy()
            for col in columns:
                spread = spreads[col]
                # Constant columns (std 0) and NaN spreads are left as they are.
                if spread > 0:
                    noise = self._rng.normal(0, self.noise_level * spread, len(noisy))
                    noisy[col] = noisy[col] + noise
            copies.append(noisy)

        df_result = pd.concat(copies, ignore_index=True)
        print(f"  Augmented size: {len(df_result)}")

        return df_result

    def smote_regression(self, df, target_col='Absenteeism time in hours', noise_cols=None):
        """Oversample sparse target ranges until every range is equally large.

        The target is bucketed with ``TARGET_BIN_EDGES``. Each non-empty bucket
        smaller than the largest one is topped up with rows drawn (with
        replacement) from that bucket, jittered by Gaussian noise scaled by
        ``noise_level`` times the bucket's own per-column standard deviation.

        Args:
            df: Source frame; it is not modified.
            target_col: Column used for bucketing; never perturbed.
            noise_cols: Optional iterable restricting which numeric columns
                are perturbed. ``None`` perturbs every numeric column.

        Returns:
            The original rows followed by the synthetic ones. The index is
            reset when synthetic rows were added; otherwise a copy of ``df``
            is returned.
        """
        bucket_of = pd.Series(
            pd.cut(
                df[target_col].values,
                bins=self.TARGET_BIN_EDGES,
                labels=self.TARGET_BIN_LABELS,
                include_lowest=True,
            ),
            index=df.index,
        )
        largest = bucket_of.value_counts().max()
        columns = self._noise_columns(df, target_col, noise_cols)

        synthetic = []
        for label in self.TARGET_BIN_LABELS:
            members = df[(bucket_of == label).to_numpy()]
            size = len(members)
            if size == 0 or size >= largest:
                continue

            n_new = int(largest - size)
            # Positional sampling keeps this correct even if df's index has duplicates.
            picks = self._rng.randint(0, size, size=n_new)
            new_rows = members.iloc[picks].reset_index(drop=True)

            for col in columns:
                spread = members[col].std()
                # A single-row bucket has a NaN spread and is duplicated unchanged.
                if spread > 0:
                    noise = self._rng.normal(0, self.noise_level * spread, n_new)
                    new_rows[col] = new_rows[col].to_numpy() + noise

            synthetic.append(new_rows)

        if synthetic:
            df_result = pd.concat([df] + synthetic, ignore_index=True)
        else:
            df_result = df.copy()

        print(f"  After SMOTE-like augmentation: {len(df_result)}")

        return df_result


class OptimizedModelTrainer:
    def __init__(self):
        """Create an empty trainer; nothing is loaded or fitted here."""
        # Preprocessing helpers and the state they produce in prepare_data().
        self.augmenter = DataAugmenter(noise_level=0.02, n_augment=2)
        self.scaler = RobustScaler()
        self.label_encoders = {}
        self.feature_names = None
        self.selected_features = None

        # Fitted estimators and their test-set metrics, both keyed by model name.
        self.models = {}
        self.model_metrics = {}
        
    def analyze_data(self, df):
        print("\n" + "="*60)
        print("Data Analysis")
        print("="*60)
        
        y = df['Absenteeism time in hours']
        
        print(f"\nTarget variable statistics:")
        print(f"  Min: {y.min()}")
        print(f"  Max: {y.max()}")
        print(f"  Mean: {y.mean():.2f}")
        print(f"  Median: {y.median():.2f}")
        print(f"  Std: {y.std():.2f}")
        print(f"  Skewness: {y.skew():.2f}")
        
        print(f"\nTarget distribution:")
        print(f"  Zero values: {(y == 0).sum()} ({(y == 0).sum() / len(y) * 100:.1f}%)")
        print(f"  1-8 hours: {((y > 0) & (y <= 8)).sum()} ({((y > 0) & (y <= 8)).sum() / len(y) * 100:.1f}%)")
        print(f"  >8 hours: {(y > 8).sum()} ({(y > 8).sum() / len(y) * 100:.1f}%)")
        
        return y
    
    def clip_outliers(self, df, columns, lower_pct=1, upper_pct=99):
        df_clean = df.copy()
        
        for col in columns:
            if col in df_clean.columns and df_clean[col].dtype in ['int64', 'float64']:
                if col == 'Absenteeism time in hours':
                    continue
                lower = df_clean[col].quantile(lower_pct / 100)
                upper = df_clean[col].quantile(upper_pct / 100)
                df_clean[col] = df_clean[col].clip(lower, upper)
        
        return df_clean
    
    def feature_engineering(self, df):
        df = df.copy()
        
        df['workload_per_age'] = df['Work load Average/day'] / (df['Age'] + 1)
        df['expense_per_distance'] = df['Transportation expense'] / (df['Distance from Residence to Work'] + 1)
        df['age_service_ratio'] = df['Age'] / (df['Service time'] + 1)
        
        df['has_children'] = (df['Son'] > 0).astype(int)
        df['has_pet'] = (df['Pet'] > 0).astype(int)
        df['family_responsibility'] = df['Son'] + df['Pet']
        
        df['health_risk'] = ((df['Social drinker'] == 1) | (df['Social smoker'] == 1) | (df['Body mass index'] > 30)).astype(int)
        df['lifestyle_risk'] = df['Social drinker'].astype(int) + df['Social smoker'].astype(int)
        
        df['age_group'] = pd.cut(df['Age'], bins=[0, 30, 40, 50, 100], labels=[1, 2, 3, 4])
        df['service_group'] = pd.cut(df['Service time'], bins=[0, 5, 10, 20, 100], labels=[1, 2, 3, 4])
        df['bmi_category'] = pd.cut(df['Body mass index'], bins=[0, 18.5, 25, 30, 100], labels=[1, 2, 3, 4])
        
        df['workload_category'] = pd.cut(df['Work load Average/day'], bins=[0, 200, 250, 300, 500], labels=[1, 2, 3, 4])
        df['commute_category'] = pd.cut(df['Distance from Residence to Work'], bins=[0, 10, 20, 50, 100], labels=[1, 2, 3, 4])
        
        df['seasonal_risk'] = df['Seasons'].apply(lambda x: 1 if x in [1, 3] else 0)
        df['weekday_risk'] = df['Day of the week'].apply(lambda x: 1 if x in [2, 6] else 0)
        
        df['hit_target_ratio'] = df['Hit target'] / 100
        df['experience_level'] = pd.cut(df['Service time'], bins=[0, 5, 10, 15, 100], labels=[1, 2, 3, 4])
        
        df['age_workload_interaction'] = df['Age'] * df['Work load Average/day'] / 10000
        df['service_bmi_interaction'] = df['Service time'] * df['Body mass index'] / 100
        
        return df
    
    def select_features(self, X, y, k=20):
        """Keep the ``k`` columns of ``X`` with the highest univariate F-score.

        Reads ``self.feature_names`` (one name per column of ``X``), prints the
        ranking, and stores the names of the kept columns in
        ``self.selected_features``.

        Args:
            X: 2-D feature matrix.
            y: Target values aligned with the rows of ``X``.
            k: Maximum number of features to keep; capped at ``X.shape[1]``.

        Returns:
            ``X`` reduced to the selected columns.
        """
        print("\nFeature Selection...")

        n_keep = min(k, X.shape[1])
        selector = SelectKBest(score_func=f_regression, k=n_keep)
        selector.fit(X, y)

        # Rank every feature by score, best first, for the printed report.
        ranked = sorted(
            zip(self.feature_names, selector.scores_),
            key=lambda pair: pair[1],
            reverse=True,
        )
        shown = min(k, len(ranked))

        print(f"\nTop {shown} features by F-score:")
        for position, (name, score) in enumerate(ranked[:shown], start=1):
            # Display name from config.FEATURE_NAME_CN, falling back to the raw name.
            cn = config.FEATURE_NAME_CN.get(name, name)
            print(f"  {position}. {cn}: {score:.2f}")

        kept = selector.get_support()
        self.selected_features = [name for name, keep in zip(self.feature_names, kept) if keep]

        return selector.transform(X)
    
    def prepare_data(self):
        """Load the cleaned dataset and build train/test matrices without leakage.

        The rows are split into train and test parts *before* anything is
        fitted or synthesised, so the test set contains only real rows that
        the training pipeline never saw:

        * outlier bounds, the scaler and the feature selector are fitted on
          the training rows only and then applied to the test rows;
        * oversampling and noise augmentation are applied to the training
          rows only.

        Side effects: fits ``self.scaler``, fills ``self.label_encoders`` and
        sets ``self.feature_names`` / ``self.selected_features``.

        Returns:
            Tuple ``(X_train, X_test, y_train, y_test)`` of arrays.
        """
        target_col = 'Absenteeism time in hours'
        banner = "=" * 60

        df = get_clean_data()
        df.columns = [col.strip() for col in df.columns]

        # All four columns are optional: drop whichever of them is present.
        df = df.drop(columns=['ID', 'Weight', 'Height', 'Reason for absence'], errors='ignore')
        print("  Removed features: Weight, Height, Reason for absence (data leakage risk)")

        self.analyze_data(df)

        print("\n" + banner)
        print("Data Preprocessing")
        print(banner)

        numerical_cols = ['Age', 'Service time', 'Work load Average/day',
                          'Transportation expense', 'Distance from Residence to Work',
                          'Hit target', 'Body mass index']

        # Split first: every later step that learns from data sees training rows only.
        train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
        train_df = train_df.reset_index(drop=True)
        test_df = test_df.reset_index(drop=True)

        train_df = self.clip_outliers(train_df, numerical_cols)
        # Limit the test rows to the value range of the (clipped) training data.
        for col in numerical_cols:
            if col in test_df.columns and pd.api.types.is_numeric_dtype(test_df[col]):
                test_df[col] = test_df[col].clip(train_df[col].min(), train_df[col].max())
        print("  Outliers clipped (1st-99th percentile)")

        print("\n" + banner)
        print("Data Augmentation")
        print(banner)

        train_df = self.augmenter.smote_regression(train_df)
        train_df = self.augmenter.augment(train_df)

        # The augmenter jitters every numeric column, including integer-coded
        # categorical/binary ones. Rounding restores their discrete values so the
        # equality checks, int casts and label encoding below behave as intended.
        # NOTE(review): assumes the noise stays well below 0.5 (true for the
        # configured noise_level of 0.02) - confirm if that setting changes.
        discrete_cols = [col for col in test_df.columns
                         if col != target_col and col not in numerical_cols]
        for col in discrete_cols:
            if (pd.api.types.is_numeric_dtype(train_df[col])
                    and pd.api.types.is_numeric_dtype(test_df[col])):
                # Same float dtype on both sides so their string labels agree.
                train_df[col] = train_df[col].round().astype(float)
                test_df[col] = test_df[col].astype(float)

        print("\n" + banner)
        print("Feature Engineering")
        print(banner)

        train_df = self.feature_engineering(train_df)
        test_df = self.feature_engineering(test_df)

        y_train = train_df[target_col].values
        y_test = test_df[target_col].values
        X_train_df = train_df.drop(columns=[target_col])
        # Guarantee identical column order in both matrices.
        X_test_df = test_df.drop(columns=[target_col])[X_train_df.columns]

        ordinal_cols = ['Month of absence', 'Day of the week', 'Seasons',
                        'Disciplinary failure', 'Education', 'Social drinker',
                        'Social smoker', 'age_group', 'service_group',
                        'bmi_category', 'workload_category', 'commute_category',
                        'experience_level']

        for col in ordinal_cols:
            if col in X_train_df.columns:
                le = LabelEncoder()
                train_labels = X_train_df[col].astype(str)
                test_labels = X_test_df[col].astype(str)
                # Fit on the labels of both parts: this only fixes the label
                # vocabulary, so a label seen only in the test rows cannot fail.
                le.fit(pd.concat([train_labels, test_labels], ignore_index=True))
                X_train_df[col] = le.transform(train_labels)
                X_test_df[col] = le.transform(test_labels)
                self.label_encoders[col] = le

        self.feature_names = list(X_train_df.columns)

        X_train = self.scaler.fit_transform(X_train_df.values.astype(float))
        X_test = self.scaler.transform(X_test_df.values.astype(float))

        X_train = self.select_features(X_train, y_train, k=20)
        # Apply the same column selection to the test matrix.
        keep = np.isin(self.feature_names, self.selected_features)
        X_test = X_test[:, keep]

        print(f"\nFinal feature count: {X_train.shape[1]}")

        return X_train, X_test, y_train, y_test
    
    def train_random_forest(self, X_train, y_train, n_iter=20, cv=5):
        """Tune a RandomForestRegressor with randomized search and store the winner.

        The best estimator is saved as ``self.models['random_forest']``.

        Args:
            X_train: Training feature matrix.
            y_train: Training target values.
            n_iter: Number of parameter settings sampled by the search.
            cv: Number of cross-validation folds (an integer).

        Returns:
            The best estimator found by the search.
        """
        print("\n" + "="*60)
        print("Training Random Forest")
        print("="*60)

        start_time = time.time()
        rf = RandomForestRegressor(random_state=42, n_jobs=-1)

        param_distributions = {
            'n_estimators': [200, 300, 400],
            'max_depth': [10, 15, 20, 25],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': ['sqrt', 0.7]
        }

        # n_iter settings are sampled; each one is fitted once per fold.
        print(f"  Searching {n_iter} parameter combinations ({n_iter * cv} fits with {cv}-fold CV)...")
        random_search = RandomizedSearchCV(
            rf, param_distributions, n_iter=n_iter, cv=cv,
            scoring='r2', n_jobs=-1, random_state=42
        )
        random_search.fit(X_train, y_train)

        self.models['random_forest'] = random_search.best_estimator_
        print_training_log("Random Forest", start_time, random_search.best_score_,
                           random_search.best_params_, n_iter, cv)

        return random_search.best_estimator_
    
    def train_xgboost(self, X_train, y_train, n_iter=20, cv=5):
        """Tune an XGBRegressor with randomized search and store the winner.

        The best estimator is saved as ``self.models['xgboost']``.

        Args:
            X_train: Training feature matrix.
            y_train: Training target values.
            n_iter: Number of parameter settings sampled by the search.
            cv: Number of cross-validation folds (an integer).

        Returns:
            The best estimator found by the search.
        """
        print("\n" + "="*60)
        print("Training XGBoost")
        print("="*60)

        start_time = time.time()
        xgb_model = xgb.XGBRegressor(random_state=42, n_jobs=-1)

        param_distributions = {
            'n_estimators': [200, 300, 400],
            'max_depth': [5, 7, 9],
            'learning_rate': [0.05, 0.1],
            'subsample': [0.7, 0.8],
            'colsample_bytree': [0.7, 0.8],
            'min_child_weight': [1, 3],
            'reg_alpha': [0, 0.1],
            'reg_lambda': [1, 1.5]
        }

        # n_iter settings are sampled; each one is fitted once per fold.
        print(f"  Searching {n_iter} parameter combinations ({n_iter * cv} fits with {cv}-fold CV)...")
        random_search = RandomizedSearchCV(
            xgb_model, param_distributions, n_iter=n_iter, cv=cv,
            scoring='r2', n_jobs=-1, random_state=42
        )
        random_search.fit(X_train, y_train)

        self.models['xgboost'] = random_search.best_estimator_
        print_training_log("XGBoost", start_time, random_search.best_score_,
                           random_search.best_params_, n_iter, cv)

        return random_search.best_estimator_
    
    def train_lightgbm(self, X_train, y_train, n_iter=20, cv=5):
        """Tune an LGBMRegressor with randomized search and store the winner.

        The best estimator is saved as ``self.models['lightgbm']``.

        Args:
            X_train: Training feature matrix.
            y_train: Training target values.
            n_iter: Number of parameter settings sampled by the search.
            cv: Number of cross-validation folds (an integer).

        Returns:
            The best estimator found by the search.
        """
        print("\n" + "="*60)
        print("Training LightGBM")
        print("="*60)

        start_time = time.time()
        lgb_model = lgb.LGBMRegressor(random_state=42, n_jobs=-1, verbose=-1)

        param_distributions = {
            'n_estimators': [200, 300, 400],
            'max_depth': [7, 9, 11, -1],
            'learning_rate': [0.05, 0.1],
            'subsample': [0.7, 0.8],
            'colsample_bytree': [0.7, 0.8],
            'min_child_samples': [5, 10, 20],
            'reg_alpha': [0, 0.1],
            'reg_lambda': [1, 1.5],
            'num_leaves': [31, 50, 70]
        }

        # n_iter settings are sampled; each one is fitted once per fold.
        print(f"  Searching {n_iter} parameter combinations ({n_iter * cv} fits with {cv}-fold CV)...")
        random_search = RandomizedSearchCV(
            lgb_model, param_distributions, n_iter=n_iter, cv=cv,
            scoring='r2', n_jobs=-1, random_state=42
        )
        random_search.fit(X_train, y_train)

        self.models['lightgbm'] = random_search.best_estimator_
        print_training_log("LightGBM", start_time, random_search.best_score_,
                           random_search.best_params_, n_iter, cv)

        return random_search.best_estimator_
    
    def train_gradient_boosting(self, X_train, y_train, n_iter=15, cv=5):
        """Tune a GradientBoostingRegressor with randomized search and store the winner.

        The best estimator is saved as ``self.models['gradient_boosting']``.

        Args:
            X_train: Training feature matrix.
            y_train: Training target values.
            n_iter: Number of parameter settings sampled by the search.
            cv: Number of cross-validation folds (an integer).

        Returns:
            The best estimator found by the search.
        """
        print("\n" + "="*60)
        print("Training Gradient Boosting")
        print("="*60)

        start_time = time.time()
        gb = GradientBoostingRegressor(random_state=42)

        param_distributions = {
            'n_estimators': [200, 300],
            'max_depth': [5, 7, 9],
            'learning_rate': [0.05, 0.1],
            'subsample': [0.7, 0.8],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [1, 2]
        }

        # n_iter settings are sampled; each one is fitted once per fold.
        print(f"  Searching {n_iter} parameter combinations ({n_iter * cv} fits with {cv}-fold CV)...")
        random_search = RandomizedSearchCV(
            gb, param_distributions, n_iter=n_iter, cv=cv,
            scoring='r2', n_jobs=-1, random_state=42
        )
        random_search.fit(X_train, y_train)

        self.models['gradient_boosting'] = random_search.best_estimator_
        print_training_log("Gradient Boosting", start_time, random_search.best_score_,
                           random_search.best_params_, n_iter, cv)

        return random_search.best_estimator_
    
    def train_extra_trees(self, X_train, y_train, n_iter=20, cv=5):
        """Tune an ExtraTreesRegressor with randomized search and store the winner.

        The best estimator is saved as ``self.models['extra_trees']``.

        Args:
            X_train: Training feature matrix.
            y_train: Training target values.
            n_iter: Number of parameter settings sampled by the search.
            cv: Number of cross-validation folds (an integer).

        Returns:
            The best estimator found by the search.
        """
        print("\n" + "="*60)
        print("Training Extra Trees")
        print("="*60)

        start_time = time.time()
        et = ExtraTreesRegressor(random_state=42, n_jobs=-1)

        param_distributions = {
            'n_estimators': [200, 300, 400],
            'max_depth': [10, 15, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': ['sqrt', 0.7]
        }

        # n_iter settings are sampled; each one is fitted once per fold.
        print(f"  Searching {n_iter} parameter combinations ({n_iter * cv} fits with {cv}-fold CV)...")
        random_search = RandomizedSearchCV(
            et, param_distributions, n_iter=n_iter, cv=cv,
            scoring='r2', n_jobs=-1, random_state=42
        )
        random_search.fit(X_train, y_train)

        self.models['extra_trees'] = random_search.best_estimator_
        print_training_log("Extra Trees", start_time, random_search.best_score_,
                           random_search.best_params_, n_iter, cv)

        return random_search.best_estimator_
    
    def train_stacking(self, X_train, y_train):
        print("\n" + "="*60)
        print("Training Stacking Ensemble")
        print("="*60)
        
        start_time = time.time()
        base_estimators = []
        
        if 'random_forest' in self.models:
            base_estimators.append(('rf', self.models['random_forest']))
        if 'xgboost' in self.models:
            base_estimators.append(('xgb', self.models['xgboost']))
        if 'lightgbm' in self.models:
            base_estimators.append(('lgb', self.models['lightgbm']))
        if 'gradient_boosting' in self.models:
            base_estimators.append(('gb', self.models['gradient_boosting']))
        
        if len(base_estimators) < 2:
            print("  Not enough base models for stacking")
            return None
        
        print(f"  Base estimators: {[name for name, _ in base_estimators]}")
        print(f"  Meta learner: Ridge")
        print(f"  CV folds: 5")
        
        stacking = StackingRegressor(
            estimators=base_estimators,
            final_estimator=Ridge(alpha=1.0),
            cv=5,
            n_jobs=-1
        )
        stacking.fit(X_train, y_train)
        
        self.models['stacking'] = stacking
        elapsed = time.time() - start_time
        print(f"  {'─'*50}")
        print(f"  Stacking ensemble created in {elapsed:.1f}s")
        print(f"  {'─'*50}")
        
        return stacking
    
    def evaluate_model(self, model, X_test, y_test):
        """Score ``model`` on the given data.

        Args:
            model: Fitted estimator exposing ``predict``.
            X_test: Feature matrix to predict on.
            y_test: True target values for ``X_test``.

        Returns:
            Dict with keys 'r2', 'mse', 'rmse' and 'mae', each rounded to
            four decimals.
        """
        predictions = model.predict(X_test)

        squared_error = mean_squared_error(y_test, predictions)
        raw_scores = {
            'r2': r2_score(y_test, predictions),
            'mse': squared_error,
            'rmse': np.sqrt(squared_error),
            'mae': mean_absolute_error(y_test, predictions),
        }

        return {metric: round(value, 4) for metric, value in raw_scores.items()}
    
    def save_models(self):
        """Persist every trained model and the preprocessing state with joblib.

        Models go to ``config.MODELS_DIR`` as '<name>_model.pkl'; the scaler
        goes to ``config.SCALER_PATH``; feature names, selected features, label
        encoders and metrics go to ``config.MODELS_DIR`` as separate pickles.
        """
        models_dir = config.MODELS_DIR
        os.makedirs(models_dir, exist_ok=True)

        for name, model in self.models.items():
            if model is None:
                continue
            joblib.dump(model, os.path.join(models_dir, f'{name}_model.pkl'))
            print(f"  {name} saved")

        joblib.dump(self.scaler, config.SCALER_PATH)

        # Remaining artefacts, written in this order next to the models.
        artefacts = (
            ('feature_names.pkl', self.feature_names),
            ('selected_features.pkl', self.selected_features),
            ('label_encoders.pkl', self.label_encoders),
            ('model_metrics.pkl', self.model_metrics),
        )
        for filename, payload in artefacts:
            joblib.dump(payload, os.path.join(models_dir, filename))
        print("  Scaler and feature info saved")
    
    def train_all(self):
        total_start = time.time()
        print("\n" + "="*60)
        print("Optimized Model Training Started")
        print("="*60)
        print(f"Start time: {time.strftime('%Y-%m-%d %H:%M:%S')}")
        
        X_train, X_test, y_train, y_test = self.prepare_data()
        
        print(f"\nTrain size: {len(X_train)}, Test size: {len(X_test)}")
        
        print("\n" + "="*60)
        print("Training Models with Hyperparameter Optimization")
        print("="*60)
        
        self.train_random_forest(X_train, y_train)
        self.train_extra_trees(X_train, y_train)
        self.train_xgboost(X_train, y_train)
        self.train_lightgbm(X_train, y_train)
        self.train_gradient_boosting(X_train, y_train)
        self.train_stacking(X_train, y_train)
        
        print("\n" + "="*60)
        print("Evaluating Models on Test Set")
        print("="*60)
        
        best_r2 = -float('inf')
        best_model = None
        
        for name, model in self.models.items():
            if model is not None:
                metrics = self.evaluate_model(model, X_test, y_test)
                self.model_metrics[name] = metrics
                
                status = "Good" if metrics['r2'] > 0.5 else ("OK" if metrics['r2'] > 0.3 else "Poor")
                status_icon = "✓" if status == "Good" else ("△" if status == "OK" else "✗")
                print(f"  {status_icon} {name:20s} - R2: {metrics['r2']:.4f}, RMSE: {metrics['rmse']:.4f}, MAE: {metrics['mae']:.4f}")
                
                if metrics['r2'] > best_r2:
                    best_r2 = metrics['r2']
                    best_model = name
        
        print(f"\n  ★ Best Model: {best_model} (R2 = {best_r2:.4f})")
        
        print("\n" + "="*60)
        print("Saving Models")
        print("="*60)
        self.save_models()
        
        return self.model_metrics


def train_and_save_models():
    """Train every model, save the artefacts and print a ranking by R2.

    Returns:
        The metrics dict returned by ``OptimizedModelTrainer.train_all``
        (model name -> metric dict).
    """
    total_start = time.time()
    metrics = OptimizedModelTrainer().train_all()
    total_elapsed = time.time() - total_start

    banner = "=" * 60
    print("\n" + banner)
    print("Training Complete!")
    print(banner)
    print(f"Total training time: {total_elapsed:.1f}s ({total_elapsed/60:.1f} min)")
    print(f"End time: {time.strftime('%Y-%m-%d %H:%M:%S')}")

    divider = "-" * 60
    print("\n" + divider)
    print("Final Model Ranking (by R2)")
    print(divider)

    # Podium markers for the first three places; everything else gets padding.
    medals = {1: "🥇", 2: "🥈", 3: "🥉"}
    ranking = sorted(metrics.items(), key=lambda entry: entry[1]['r2'], reverse=True)
    for place, (name, m) in enumerate(ranking, start=1):
        medal = medals.get(place, "  ")
        print(f"  {medal} {place}. {name:20s} - R2: {m['r2']:.4f}, RMSE: {m['rmse']:.4f}")

    return metrics


# Script entry point: run the full training pipeline only when this file is
# executed directly, not when it is imported.
if __name__ == '__main__':
    train_and_save_models()