import os

import joblib
import pandas as pd
from sklearn.preprocessing import StandardScaler

import config
from core.generate_dataset import ensure_dataset


class DataPreprocessor:
    """Load, clean, and scale the project's tabular dataset.

    Wraps a ``StandardScaler`` together with the column order it was fitted
    on, and persists/restores both via joblib so that inference-time
    transforms match training exactly.
    """

    def __init__(self):
        self.scaler = StandardScaler()
        self.is_fitted = False
        # Column order used at fit time; transform() realigns to this,
        # because StandardScaler is purely positional.
        self.feature_names = None

    def load_raw_data(self):
        """Read the raw CSV (generating it first if absent) as a DataFrame.

        Column names are stripped of surrounding whitespace, since the raw
        file's header may contain padding around the separator.
        """
        ensure_dataset()
        df = pd.read_csv(config.RAW_DATA_PATH, sep=config.CSV_SEPARATOR)
        df.columns = df.columns.str.strip()
        return df

    def clean_data(self, df):
        """Return a de-duplicated copy of *df* with missing values imputed.

        Numeric columns are filled with their median, other columns with
        their mode. Columns that are entirely null are left as-is: the
        numeric median is NaN (a no-op fill) and the mode of an empty
        column does not exist — indexing ``mode()[0]`` there would raise
        IndexError, so we skip the fill instead of crashing.
        """
        df = df.copy().drop_duplicates()
        for col in df.columns:
            if df[col].isnull().sum() == 0:
                continue
            if pd.api.types.is_numeric_dtype(df[col]):
                df[col] = df[col].fillna(df[col].median())
            else:
                mode = df[col].mode()
                if not mode.empty:  # empty when the column is all-null
                    df[col] = df[col].fillna(mode[0])
        return df

    def fit_transform(self, df):
        """Clean *df*, fit the scaler on its features, and return (X, y).

        Returns:
            X: scaled feature matrix (ndarray).
            y: target values if ``config.TARGET_COLUMN`` is present,
               otherwise ``None``.
        """
        df = self.clean_data(df)
        if config.TARGET_COLUMN in df.columns:
            y = df[config.TARGET_COLUMN].values
            feature_df = df.drop(columns=[config.TARGET_COLUMN])
        else:
            y = None
            feature_df = df
        # Remember the exact training column order for later transforms.
        self.feature_names = list(feature_df.columns)
        X = self.scaler.fit_transform(feature_df.values)
        self.is_fitted = True
        return X, y

    def transform(self, df):
        """Clean *df* and scale its features with the fitted scaler.

        Raises:
            ValueError: if called before ``fit_transform``/``load_preprocessor``.
            KeyError: if *df* lacks a column the scaler was fitted on.
        """
        if not self.is_fitted:
            raise ValueError("Preprocessor has not been fitted yet.")
        df = self.clean_data(df)
        if config.TARGET_COLUMN in df.columns:
            feature_df = df.drop(columns=[config.TARGET_COLUMN])
        else:
            feature_df = df
        # Realign to the training column order: StandardScaler is
        # positional, so a reordered DataFrame would silently be scaled
        # with the wrong per-column statistics.
        if self.feature_names is not None:
            feature_df = feature_df[self.feature_names]
        return self.scaler.transform(feature_df.values)

    def save_preprocessor(self):
        """Persist the fitted scaler and feature order under MODELS_DIR.

        Raises:
            ValueError: if the scaler has not been fitted — dumping an
            unfitted scaler would silently write a useless artifact.
        """
        if not self.is_fitted:
            raise ValueError("Cannot save an unfitted preprocessor.")
        os.makedirs(config.MODELS_DIR, exist_ok=True)
        joblib.dump(self.scaler, config.SCALER_PATH)
        joblib.dump(
            self.feature_names,
            os.path.join(config.MODELS_DIR, 'feature_names.pkl'),
        )

    def load_preprocessor(self):
        """Restore a previously saved scaler (and feature order, if saved)."""
        self.scaler = joblib.load(config.SCALER_PATH)
        feature_names_path = os.path.join(config.MODELS_DIR, 'feature_names.pkl')
        # Older artifacts may predate feature-name persistence.
        if os.path.exists(feature_names_path):
            self.feature_names = joblib.load(feature_names_path)
        self.is_fitted = True


def get_clean_data():
    """Load the raw dataset and return it cleaned (no scaling, no I/O side effects)."""
    preprocessor = DataPreprocessor()
    df = preprocessor.load_raw_data()
    return preprocessor.clean_data(df)


def save_clean_data():
    """Clean the raw dataset and write it to CLEAN_DATA_PATH; return the frame.

    The cleaned file is always written comma-separated regardless of the
    raw file's ``config.CSV_SEPARATOR`` — downstream consumers read the
    normalized format.
    """
    preprocessor = DataPreprocessor()
    df = preprocessor.load_raw_data()
    df = preprocessor.clean_data(df)
    os.makedirs(config.PROCESSED_DATA_DIR, exist_ok=True)
    df.to_csv(config.CLEAN_DATA_PATH, index=False, sep=',')
    return df


if __name__ == '__main__':
    data = save_clean_data()
    print(f"Clean data saved. Shape: {data.shape}")