"""Data-preparation helpers: column cleanup, outlier clipping, feature
engineering, label encoding and construction of single-row prediction frames.
"""

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

import config

TARGET_COLUMN = config.TARGET_COLUMN
ID_COLUMN = config.EMPLOYEE_ID_COLUMN
COMPANY_COLUMN = config.COMPANY_ID_COLUMN

# Identifier columns that are dropped before modeling.
LEAKY_COLUMNS = [ID_COLUMN, COMPANY_COLUMN]

# Columns that are always label-encoded when present (in addition to any
# object/category columns discovered at fit time).
ORDINAL_COLUMNS = [
    '企业规模',
    '所在城市等级',
    '岗位级别',
    '最高学历',
    '绩效等级',
    '心理压力等级',
    '工龄分层',
    '年龄分层',
    '通勤分层',
    '加班分层',
]

# Numeric columns intended for percentile-based clipping.
NUMERICAL_OUTLIER_COLUMNS = [
    '年龄',
    '司龄年数',
    '月均加班时长',
    '近30天出勤天数',
    '近90天缺勤次数',
    '近180天请假总时长',
    '通勤时长分钟',
    '通勤距离公里',
    '团队人数',
    '直属上级管理跨度',
    'BMI',
    '近30天睡眠时长均值',
    '每周运动频次',
]

# Fallback values used by build_prediction_dataframe for missing input keys.
DEFAULT_PREDICTION_INPUT = {
    'industry': '制造业',
    'company_size': '1000-4999人',
    'city_tier': '新一线',
    'age': 31,
    'tenure_years': 4.5,
    'education_level': '本科',
    'marital_status': '已婚',
    'job_family': '专业技术',
    'job_level': '中级',
    'employment_type': '正式员工',
    'shift_type': '标准白班',
    'is_night_shift': 0,
    'monthly_overtime_hours': 26,
    'attendance_days_30d': 22,
    'absence_count_90d': 1,
    'leave_hours_180d': 18,
    'commute_minutes': 42,
    'commute_km': 18,
    'cross_city_commute': 0,
    'performance_level': 'B',
    'disciplinary_count_12m': 0,
    'team_size': 10,
    'manager_span': 14,
    'bmi': 24.5,
    'chronic_disease_flag': 0,
    'annual_check_abnormal_flag': 0,
    'sleep_hours': 7.1,
    'exercise_frequency': 2,
    'smoking_flag': 0,
    'drinking_flag': 0,
    'stress_level': '中',
    'sedentary_job_flag': 1,
    'local_hukou_flag': 1,
    'children_count': 1,
    'single_child_burden_flag': 0,
    'absence_month': 5,
    'weekday': 2,
    'near_holiday_flag': 0,
    'leave_channel': '系统申请',
    'leave_type': '病假',
    'leave_reason_category': '身体不适',
    'medical_certificate_flag': 1,
    'urgent_leave_flag': 1,
    'continuous_absence_flag': 0,
    'previous_day_overtime_flag': 1,
}

# Fallbacks for input keys that build_prediction_dataframe accepts but that
# are not listed in DEFAULT_PREDICTION_INPUT.
_EXTRA_PREDICTION_DEFAULTS = {
    'department_line': '研发',
    'gender': '男',
    'housing_type': '租房',
}


def make_target_bins(y):
    """Bucket target values into coarse string labels.

    Bin edges are ``[0, 4]``, ``(4, 8]``, ``(8, 12]`` and ``(12, inf)``,
    labelled ``'low'``, ``'medium'``, ``'high'`` and ``'extreme'``.

    Args:
        y: Array-like of numeric target values.

    Returns:
        A ``pd.Series`` of strings. Values that fall outside every bin
        (e.g. negatives or missing values) are not given one of the four
        labels; they appear as the string form of a missing value.
    """
    y_series = pd.Series(y)
    bins = pd.cut(
        y_series,
        bins=[0, 4, 8, 12, np.inf],
        labels=['low', 'medium', 'high', 'extreme'],
        include_lowest=True,
    )
    return bins.astype(str)


def normalize_columns(df):
    """Return a copy of *df* with surrounding whitespace stripped from
    string column labels.

    Non-string labels (e.g. integer column names) are left untouched
    instead of raising ``AttributeError``.
    """
    df = df.copy()
    df.columns = [col.strip() if isinstance(col, str) else col for col in df.columns]
    return df


def prepare_modeling_dataframe(df):
    """Normalize column labels and drop the columns in ``LEAKY_COLUMNS``.

    Columns from ``LEAKY_COLUMNS`` that are absent are silently ignored.
    The input frame is not modified.
    """
    df = normalize_columns(df)
    drop_cols = [col for col in LEAKY_COLUMNS if col in df.columns]
    if drop_cols:
        df = df.drop(columns=drop_cols)
    return df


def fit_outlier_bounds(df, columns, lower_pct=1, upper_pct=99):
    """Compute per-column clipping bounds from percentiles.

    Args:
        df: Frame to measure.
        columns: Candidate column names; names that are missing from *df*
            or whose dtype is not numeric are skipped.
        lower_pct: Lower percentile (0-100).
        upper_pct: Upper percentile (0-100).

    Returns:
        Dict mapping column name to a ``(lower, upper)`` tuple of floats.
    """
    bounds = {}
    for col in columns:
        if col in df.columns and pd.api.types.is_numeric_dtype(df[col]):
            bounds[col] = (
                float(df[col].quantile(lower_pct / 100)),
                float(df[col].quantile(upper_pct / 100)),
            )
    return bounds


def apply_outlier_bounds(df, bounds):
    """Return a copy of *df* with each bounded column clipped to its range.

    Args:
        df: Frame to clip.
        bounds: Mapping of column name to ``(lower, upper)``, as produced by
            ``fit_outlier_bounds``. Columns missing from *df* are skipped.
    """
    df = df.copy()
    for col, (lower, upper) in bounds.items():
        if col in df.columns:
            df[col] = df[col].clip(lower, upper)
    return df


def engineer_features(df):
    """Return a copy of *df* with derived index, flag and bucket columns added.

    The input must already contain every raw column referenced below
    (a missing column raises ``KeyError``).

    The tenure, age and commute buckets include their lowest bin edge, so a
    value exactly on that edge (tenure 0, age 18, commute 0 minutes) is
    assigned to bucket ``'1'`` rather than left missing. Values outside the
    overall bin range are still left missing.
    """
    df = df.copy()

    df['加班通勤压力指数'] = (
        df['月均加班时长'] * 0.45
        + df['通勤时长分钟'] * 0.35
        + df['是否夜班岗位'] * 12
        + df['前一工作日是否加班'] * 6
    ) / 10
    df['家庭负担指数'] = (
        df['子女数量'] * 1.2
        + df['是否独生子女家庭负担'] * 1.5
        + (df['婚姻状态'] == '已婚').astype(int) * 0.6
    )
    df['健康风险指数'] = (
        df['是否慢性病史'] * 2
        + df['年度体检异常标记'] * 1.2
        + (df['BMI'] >= 28).astype(int) * 1.1
        + df['是否吸烟'] * 0.8
        + df['是否饮酒'] * 0.4
        + (df['近30天睡眠时长均值'] < 6.5).astype(int) * 1.2
    )
    df['岗位稳定性指数'] = (
        df['司龄年数'] * 0.3
        + (df['绩效等级'] == 'A').astype(int) * 1.2
        + (df['绩效等级'] == 'B').astype(int) * 0.8
        - df['近12月违纪次数'] * 0.7
    )
    df['节假日风险标记'] = (
        (df['是否节假日前后'] == 1)
        | (df['请假类型'].isin(['事假', '年假', '调休']))
    ).astype(int)
    df['排班压力标记'] = (
        (df['班次类型'].isin(['两班倒', '三班倒']))
        | (df['是否夜班岗位'] == 1)
    ).astype(int)
    df['缺勤历史强度'] = df['近90天缺勤次数'] * 1.5 + df['近180天请假总时长'] / 12
    df['生活规律指数'] = (
        df['近30天睡眠时长均值'] * 0.6
        + df['每周运动频次'] * 0.7
        - df['是否吸烟'] * 1.1
        - df['是否饮酒'] * 0.5
    )
    df['管理负荷指数'] = df['团队人数'] * 0.4 + df['直属上级管理跨度'] * 0.25

    bucket_labels = ['1', '2', '3', '4']
    # include_lowest=True keeps values sitting exactly on the first edge in
    # bucket '1'; without it pd.cut's right-closed bins would exclude them.
    df['工龄分层'] = pd.cut(
        df['司龄年数'], bins=[0, 2, 5, 10, 40], labels=bucket_labels, include_lowest=True
    )
    df['年龄分层'] = pd.cut(
        df['年龄'], bins=[18, 25, 32, 40, 60], labels=bucket_labels, include_lowest=True
    )
    df['通勤分层'] = pd.cut(
        df['通勤时长分钟'], bins=[0, 25, 45, 70, 180], labels=bucket_labels, include_lowest=True
    )
    # The -1 lower edge already places 0 overtime hours inside the first bin.
    df['加班分层'] = pd.cut(
        df['月均加班时长'], bins=[-1, 10, 25, 45, 120], labels=bucket_labels
    )
    return df


def fit_label_encoders(df, ordinal_columns=None):
    """Fit one ``LabelEncoder`` per categorical column and encode a copy of *df*.

    Encoded columns are every object/category-dtype column plus any of
    *ordinal_columns* present in *df*. Values are cast to ``str`` before
    fitting, so missing values become their own string class.

    Args:
        df: Frame to encode; not modified.
        ordinal_columns: Extra columns to encode. A falsy value (``None`` or
            an empty list) falls back to ``ORDINAL_COLUMNS``.

    Returns:
        Tuple ``(encoded_df, encoders)`` where *encoders* maps column name
        to its fitted ``LabelEncoder``.
    """
    ordinal_columns = ordinal_columns or ORDINAL_COLUMNS
    df = df.copy()
    encoders = {}
    object_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
    encode_columns = sorted(
        set(object_columns + [col for col in ordinal_columns if col in df.columns])
    )
    for col in encode_columns:
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col].astype(str))
        encoders[col] = encoder
    return df, encoders


def apply_label_encoders(df, encoders):
    """Encode a copy of *df* using previously fitted encoders.

    Args:
        df: Frame to encode; not modified.
        encoders: Mapping of column name to fitted ``LabelEncoder``. Columns
            missing from *df* are skipped.

    Returns:
        The encoded copy. Values are cast to ``str`` and looked up in the
        encoder's classes; a value not seen during fitting is mapped to
        code 0, i.e. it becomes indistinguishable from the first class.
    """
    df = df.copy()
    for col, encoder in encoders.items():
        if col not in df.columns:
            continue
        value_map = {cls: idx for idx, cls in enumerate(encoder.classes_)}
        df[col] = df[col].astype(str).map(lambda value: value_map.get(value, 0))
    return df


def extract_xy(df):
    """Split *df* into a feature frame and a target array.

    Returns:
        Tuple ``(X_df, y)``. When ``TARGET_COLUMN`` is present, *X_df* is
        *df* without it and *y* is that column's values; otherwise *X_df*
        is a copy of *df* and *y* is ``None``.
    """
    if TARGET_COLUMN in df.columns:
        return df.drop(columns=[TARGET_COLUMN]), df[TARGET_COLUMN].values
    return df.copy(), None


def _season_from_month(month):
    """Map a month number to a season code.

    Returns 1 for months 12/1/2, 2 for 3-5, 3 for 6-8 and 4 for anything
    else. *month* is coerced with ``int()``.
    """
    month = int(month)
    if month in (12, 1, 2):
        return 1
    if month in (3, 4, 5):
        return 2
    if month in (6, 7, 8):
        return 3
    return 4


def build_prediction_dataframe(data):
    """Build a one-row raw feature frame from a prediction request mapping.

    Args:
        data: Mapping of input keys (e.g. ``'age'``, ``'industry'``) to
            values. Missing keys fall back to ``DEFAULT_PREDICTION_INPUT``
            (or to built-in defaults for ``'department_line'``, ``'gender'``
            and ``'housing_type'``). ``None`` is treated as an empty mapping.

    Returns:
        A single-row ``pd.DataFrame`` keyed by the raw column names, with
        placeholder company/employee identifiers and a season code derived
        from the absence month.
    """
    if data is None:
        data = {}
    defaults = {**_EXTRA_PREDICTION_DEFAULTS, **DEFAULT_PREDICTION_INPUT}

    def pick(key):
        # Caller-supplied value wins; otherwise use the module default.
        return data.get(key, defaults[key])

    # Key order defines the column order of the returned frame.
    feature_row = {
        '企业编号': 'PREDICT_COMPANY',
        '所属行业': pick('industry'),
        '企业规模': pick('company_size'),
        '所在城市等级': pick('city_tier'),
        '用工类型': pick('employment_type'),
        '部门条线': pick('department_line'),
        '岗位序列': pick('job_family'),
        '岗位级别': pick('job_level'),
        '员工编号': 'PREDICT_EMPLOYEE',
        '性别': pick('gender'),
        '年龄': pick('age'),
        '司龄年数': pick('tenure_years'),
        '最高学历': pick('education_level'),
        '婚姻状态': pick('marital_status'),
        '是否本地户籍': pick('local_hukou_flag'),
        '子女数量': pick('children_count'),
        '是否独生子女家庭负担': pick('single_child_burden_flag'),
        '居住类型': pick('housing_type'),
        '班次类型': pick('shift_type'),
        '是否夜班岗位': pick('is_night_shift'),
        '月均加班时长': pick('monthly_overtime_hours'),
        '近30天出勤天数': pick('attendance_days_30d'),
        '近90天缺勤次数': pick('absence_count_90d'),
        '近180天请假总时长': pick('leave_hours_180d'),
        '通勤时长分钟': pick('commute_minutes'),
        '通勤距离公里': pick('commute_km'),
        '是否跨城通勤': pick('cross_city_commute'),
        '绩效等级': pick('performance_level'),
        '近12月违纪次数': pick('disciplinary_count_12m'),
        '团队人数': pick('team_size'),
        '直属上级管理跨度': pick('manager_span'),
        'BMI': pick('bmi'),
        '是否慢性病史': pick('chronic_disease_flag'),
        '年度体检异常标记': pick('annual_check_abnormal_flag'),
        '近30天睡眠时长均值': pick('sleep_hours'),
        '每周运动频次': pick('exercise_frequency'),
        '是否吸烟': pick('smoking_flag'),
        '是否饮酒': pick('drinking_flag'),
        '心理压力等级': pick('stress_level'),
        '是否长期久坐岗位': pick('sedentary_job_flag'),
        '缺勤月份': pick('absence_month'),
        '星期几': pick('weekday'),
        '是否节假日前后': pick('near_holiday_flag'),
        '季节': _season_from_month(pick('absence_month')),
        '请假申请渠道': pick('leave_channel'),
        '请假类型': pick('leave_type'),
        '请假原因大类': pick('leave_reason_category'),
        '是否提供医院证明': pick('medical_certificate_flag'),
        '是否临时请假': pick('urgent_leave_flag'),
        '是否连续缺勤': pick('continuous_absence_flag'),
        '前一工作日是否加班': pick('previous_day_overtime_flag'),
    }
    return pd.DataFrame([feature_row])


def align_feature_frame(df, feature_names):
    """Return a copy of *df* restricted to *feature_names*, in that order.

    Features absent from *df* are added as columns filled with 0; columns
    of *df* not listed in *feature_names* are dropped.
    """
    aligned = df.copy()
    for feature in feature_names:
        if feature not in aligned.columns:
            aligned[feature] = 0
    return aligned[feature_names]


def to_float_array(df):
    """Return the values of *df* as a float NumPy array."""
    return df.values.astype(float)