"""Cleaning, scaling and persistence helpers for the absenteeism dataset."""

import os

import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

import config

# Name of the target column that is split off from the feature matrix.
TARGET_COLUMN = 'Absenteeism time in hours'

# File (inside ``config.MODELS_DIR``) that stores the fitted feature order.
_FEATURE_NAMES_FILE = 'feature_names.pkl'


def _feature_names_path():
    """Return the path of the persisted feature-name list."""
    return os.path.join(config.MODELS_DIR, _FEATURE_NAMES_FILE)


def _is_numpy_numeric(dtype):
    """Return True for NumPy integer/unsigned/float dtypes of any width."""
    return isinstance(dtype, np.dtype) and dtype.kind in 'iuf'


class DataPreprocessor:
    """Clean raw data and standardise its feature columns.

    Attributes:
        scaler: The ``StandardScaler`` used to standardise features.
        is_fitted: True once the scaler has been fitted or loaded from disk.
        feature_names: Feature column names in the order seen by
            ``fit_transform`` (or loaded from disk); ``None`` if unknown.
    """

    def __init__(self):
        self.scaler = StandardScaler()
        self.is_fitted = False
        self.feature_names = None

    def load_raw_data(self):
        """Read the raw CSV at ``config.RAW_DATA_PATH``.

        Returns:
            A DataFrame whose column names have surrounding whitespace
            stripped.
        """
        df = pd.read_csv(config.RAW_DATA_PATH, sep=config.CSV_SEPARATOR)
        df.columns = df.columns.str.strip()
        return df

    def clean_data(self, df):
        """Return a de-duplicated copy of *df* with missing values imputed.

        Numeric (NumPy int/uint/float) columns are filled with their median,
        every other column with its most frequent value. Columns that contain
        no non-null value at all are left untouched because there is nothing
        to impute from. The input frame is never modified.

        Args:
            df: The DataFrame to clean.

        Returns:
            A new, cleaned DataFrame.
        """
        # The explicit copy detaches the result from the caller's frame so the
        # column assignments below can never write through to (or warn about)
        # the original data.
        df = df.drop_duplicates().copy()

        for col in df.columns:
            column = df[col]
            if not column.isnull().any():
                continue

            if _is_numpy_numeric(column.dtype):
                fill_value = column.median()
            else:
                modes = column.mode()
                if modes.empty:
                    # All values are null: no mode exists, skip the column.
                    continue
                fill_value = modes.iloc[0]

            # Assign the result back instead of ``df[col].fillna(...,
            # inplace=True)``: the chained in-place form acts on a temporary
            # under pandas Copy-on-Write and would leave ``df`` unchanged.
            df[col] = column.fillna(fill_value)

        return df

    @staticmethod
    def _split_target(df):
        """Split *df* into ``(feature_df, y)``; ``y`` is None without target."""
        if TARGET_COLUMN in df.columns:
            return df.drop(columns=[TARGET_COLUMN]), df[TARGET_COLUMN].values
        return df, None

    def _align_features(self, feature_df):
        """Re-order columns to the fitted order when the column set matches.

        The scaler works on a bare array, so column order is significant.
        Frames whose column set differs from the fitted one are returned
        unchanged and left for the scaler to validate.
        """
        names = self.feature_names
        if names is None:
            return feature_df

        columns = list(feature_df.columns)
        if columns == names:
            return feature_df

        same_columns = (
            feature_df.columns.is_unique
            and len(columns) == len(names)
            and set(columns) == set(names)
        )
        return feature_df[names] if same_columns else feature_df

    def fit_transform(self, df):
        """Clean *df*, fit the scaler on its features and scale them.

        Args:
            df: Raw DataFrame, optionally containing the target column.

        Returns:
            A tuple ``(X, y)`` where ``X`` is the scaled feature array and
            ``y`` is the target array, or ``None`` if the target column is
            absent.
        """
        df = self.clean_data(df)
        feature_df, y = self._split_target(df)

        self.feature_names = list(feature_df.columns)
        X = self.scaler.fit_transform(feature_df.values)
        self.is_fitted = True
        return X, y

    def transform(self, df):
        """Clean *df* and scale its features with the fitted scaler.

        Args:
            df: DataFrame to transform; the target column is dropped if
                present.

        Returns:
            The scaled feature array.

        Raises:
            ValueError: If the preprocessor has not been fitted or loaded.
        """
        if not self.is_fitted:
            raise ValueError("Preprocessor has not been fitted yet.")

        # NOTE(review): clean_data() drops duplicate rows and imputes from the
        # statistics of *this* frame, so the output can have fewer rows than
        # the input - confirm this is what prediction callers expect.
        df = self.clean_data(df)
        feature_df, _ = self._split_target(df)
        feature_df = self._align_features(feature_df)
        return self.scaler.transform(feature_df.values)

    def save_preprocessor(self):
        """Persist the scaler and the feature-name list under MODELS_DIR."""
        os.makedirs(config.MODELS_DIR, exist_ok=True)
        joblib.dump(self.scaler, config.SCALER_PATH)
        joblib.dump(self.feature_names, _feature_names_path())

    def load_preprocessor(self):
        """Load the scaler (and feature names, if saved) and mark as fitted."""
        # NOTE: joblib.load unpickles arbitrary objects - only load files
        # produced by a trusted save_preprocessor() run.
        self.scaler = joblib.load(config.SCALER_PATH)
        feature_names_path = _feature_names_path()
        if os.path.exists(feature_names_path):
            self.feature_names = joblib.load(feature_names_path)
        self.is_fitted = True


def get_clean_data():
    """Load the raw dataset and return it cleaned."""
    preprocessor = DataPreprocessor()
    return preprocessor.clean_data(preprocessor.load_raw_data())


def save_clean_data():
    """Clean the raw dataset, write it to CLEAN_DATA_PATH and return it."""
    df = get_clean_data()
    os.makedirs(config.PROCESSED_DATA_DIR, exist_ok=True)
    df.to_csv(config.CLEAN_DATA_PATH, index=False, sep=',')
    return df


if __name__ == '__main__':
    df = save_clean_data()
    print(f"Clean data saved. Shape: {df.shape}")
    print(df.head())