"""Synthetic employee-absence dataset generator.

Builds a pool of companies and employees, samples leave events for them,
attaches a per-employee event timeline, validates the result and optionally
enriches it with JD-R (Job Demands-Resources) survey-style columns.

All randomness flows through ``numpy.random.Generator`` objects; the order of
the random draws inside the builders is significant for reproducibility, so
statements that consume random numbers must not be reordered.
"""
import os
import sys

import numpy as np
import pandas as pd

# Make the parent directory importable so the project-level ``config`` module
# resolves when this file is run as a script.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import config

# Per-industry weights used when sampling shift type, overtime and night work.
INDUSTRIES = {
    '制造业': {'shift_bias': 0.9, 'overtime_bias': 0.8, 'night_bias': 0.8},
    '互联网': {'shift_bias': 0.2, 'overtime_bias': 1.0, 'night_bias': 0.2},
    '零售连锁': {'shift_bias': 0.7, 'overtime_bias': 0.5, 'night_bias': 0.3},
    '物流运输': {'shift_bias': 0.9, 'overtime_bias': 0.7, 'night_bias': 0.9},
    '金融服务': {'shift_bias': 0.1, 'overtime_bias': 0.7, 'night_bias': 0.1},
    '医药健康': {'shift_bias': 0.6, 'overtime_bias': 0.6, 'night_bias': 0.5},
    '建筑工程': {'shift_bias': 0.5, 'overtime_bias': 0.8, 'night_bias': 0.3},
}


def season_from_month(month):
    """Map a calendar month (1-12) to a season code.

    Returns 1 for Dec-Feb, 2 for Mar-May, 3 for Jun-Aug and 4 for every other
    value (i.e. Sep-Nov for valid months).
    """
    if month in [12, 1, 2]:
        return 1
    if month in [3, 4, 5]:
        return 2
    if month in [6, 7, 8]:
        return 3
    return 4


def weighted_choice(rng, items, probs):
    """Draw one element of ``items`` using (unnormalised) weights ``probs``.

    Args:
        rng: A ``numpy.random.Generator``.
        items: Sequence of candidates.
        probs: Non-negative weights, one per item; they are normalised here so
            they do not need to sum to 1.

    Returns:
        The selected item as returned by ``rng.choice`` (a NumPy scalar).

    Raises:
        ValueError: If the weights do not sum to a positive number.
    """
    probs = np.array(probs, dtype=float)
    total = probs.sum()
    # ``not total > 0`` also rejects NaN, which would otherwise surface as an
    # opaque error from inside ``rng.choice``.
    if not total > 0:
        raise ValueError('probs must sum to a positive value')
    return rng.choice(items, p=probs / total)


def build_company_pool(rng, company_count=180):
    """Create ``company_count`` company records with sampled attributes.

    Each record carries an id (``C001``...), an industry drawn from
    ``INDUSTRIES``, a size bucket and a city tier.

    Returns:
        A list of dicts, one per company.
    """
    industries = list(INDUSTRIES.keys())
    scales = ['100人以下', '100-499人', '500-999人', '1000-4999人', '5000人及以上']
    city_tiers = ['一线', '新一线', '二线', '三线及以下']
    companies = []
    for idx in range(company_count):
        industry = weighted_choice(rng, industries, [0.22, 0.14, 0.14, 0.14, 0.1, 0.12, 0.14])
        companies.append({
            '企业编号': f'C{idx + 1:03d}',
            '所属行业': industry,
            '企业规模': weighted_choice(rng, scales, [0.15, 0.28, 0.2, 0.24, 0.13]),
            '所在城市等级': weighted_choice(rng, city_tiers, [0.18, 0.34, 0.3, 0.18]),
        })
    return companies


def build_employee_pool(rng, companies, employee_count=2600):
    """Create ``employee_count`` employee records attached to random companies.

    Demographic, workload, commute and health attributes are sampled per
    employee; several of them are conditioned on age, marital status or the
    industry profile in ``INDUSTRIES``.

    Args:
        rng: A ``numpy.random.Generator``.
        companies: Non-empty list of company dicts as produced by
            ``build_company_pool``.
        employee_count: Number of employees to generate.

    Returns:
        A list of dicts, one per employee.
    """
    genders = ['男', '女']
    employment_types = ['正式员工', '劳务派遣', '外包驻场', '实习生']
    departments = ['生产', '研发', '销售', '客服', '职能', '仓储物流', '门店运营']
    job_families = ['管理', '专业技术', '销售业务', '生产操作', '行政支持', '客服坐席']
    job_levels = ['初级', '中级', '高级', '主管', '经理及以上']
    educations = ['中专及以下', '大专', '本科', '硕士', '博士']
    marital = ['未婚', '已婚', '离异/其他']
    housing = ['自有住房', '租房', '宿舍']
    shifts = ['标准白班', '两班倒', '三班倒', '弹性班']
    performance = ['A', 'B', 'C', 'D']
    stress = ['低', '中', '高']
    employees = []
    # NOTE: the statement order below fixes the sequence of random draws; do
    # not reorder without accepting a different generated dataset.
    for idx in range(employee_count):
        company = companies[rng.integers(0, len(companies))]
        industry = company['所属行业']
        age = int(np.clip(rng.normal(33, 7), 20, 55))
        tenure = round(float(np.clip(age - 21 + rng.normal(0, 2), 0.2, 32)), 1)
        family_bias = 0.6 if age >= 30 else 0.25
        married = weighted_choice(rng, marital, [0.45, 0.48, 0.07] if age < 30 else [0.18, 0.72, 0.1])
        # NOTE(review): unmarried employees get a Poisson rate of 0.4, which is
        # higher than the 0.25 used for non-unmarried employees under 30 --
        # confirm this is intended.
        children = int(np.clip(rng.poisson(0.4 if married == '未婚' else family_bias), 0, 3))
        industry_profile = INDUSTRIES[industry]
        shift = weighted_choice(
            rng,
            shifts,
            [
                max(0.1, 1 - industry_profile['shift_bias']),
                0.35 * industry_profile['shift_bias'],
                0.25 * industry_profile['shift_bias'],
                0.2,
            ],
        )
        # Three-shift workers are always night workers; two-shift workers are
        # night workers with the industry's night_bias probability.
        night_flag = int(shift == '三班倒' or (shift == '两班倒' and rng.random() < industry_profile['night_bias']))
        overtime = float(np.clip(rng.normal(22 + 18 * industry_profile['overtime_bias'], 10), 0, 90))
        commute_minutes = float(np.clip(rng.normal(42, 18), 8, 130))
        commute_km = float(np.clip(commute_minutes * rng.uniform(0.35, 0.75), 2, 65))
        performance_level = weighted_choice(rng, performance, [0.18, 0.46, 0.26, 0.1])
        chronic_flag = int(rng.random() < max(0.05, (age - 26) * 0.01))
        check_abnormal = int(chronic_flag == 1 or rng.random() < 0.14)
        sleep_hours = round(float(np.clip(rng.normal(6.9 - 0.35 * night_flag, 0.8), 4.5, 9.0)), 1)
        exercise = int(np.clip(rng.poisson(2.2), 0, 7))
        # NOTE(review): the inner draw picks between two smoking rates but is
        # independent of the gender sampled further below.
        smoking = int(rng.random() < (0.22 if rng.random() < 0.55 else 0.08))
        drinking = int(rng.random() < 0.27)
        stress_level = weighted_choice(
            rng,
            stress,
            [0.22, 0.52, 0.26 + min(0.15, overtime / 120)],
        )
        bmi = round(float(np.clip(rng.normal(24.2, 3.2), 17.5, 36.5)), 1)
        history_count = int(np.clip(rng.poisson(1.2 + chronic_flag * 0.6 + children * 0.15), 0, 8))
        history_hours = float(np.clip(rng.normal(18 + chronic_flag * 10 + history_count * 3, 10), 0, 120))
        discipline = int(np.clip(rng.poisson(0.2), 0, 4))
        team_size = int(np.clip(rng.normal(11, 5), 3, 40))
        manager_span = int(np.clip(team_size + rng.normal(3, 2), 4, 60))
        local_hukou = int(rng.random() < 0.58)
        cross_city = int(commute_minutes > 65 or (local_hukou == 0 and rng.random() < 0.35))
        sedentary = int(weighted_choice(rng, [0, 1], [0.45, 0.55]) if company['所属行业'] in ['互联网', '金融服务'] else rng.random() < 0.3)
        # Dict values are evaluated top to bottom, so the remaining random
        # draws happen in the order the keys are listed.
        employees.append({
            '企业编号': company['企业编号'],
            '所属行业': industry,
            '企业规模': company['企业规模'],
            '所在城市等级': company['所在城市等级'],
            '用工类型': weighted_choice(rng, employment_types, [0.74, 0.12, 0.1, 0.04]),
            '部门条线': weighted_choice(rng, departments, [0.18, 0.16, 0.14, 0.11, 0.12, 0.14, 0.15]),
            '岗位序列': weighted_choice(rng, job_families, [0.08, 0.24, 0.16, 0.2, 0.12, 0.2]),
            '岗位级别': weighted_choice(rng, job_levels, [0.34, 0.32, 0.18, 0.11, 0.05]),
            '员工编号': f'E{idx + 1:05d}',
            '性别': weighted_choice(rng, genders, [0.56, 0.44]),
            '年龄': age,
            '司龄年数': tenure,
            '最高学历': weighted_choice(rng, educations, [0.14, 0.28, 0.4, 0.15, 0.03]),
            '婚姻状态': married,
            '是否本地户籍': local_hukou,
            '子女数量': children,
            '是否独生子女家庭负担': int(children >= 2 or (married == '已婚' and rng.random() < 0.18)),
            '居住类型': weighted_choice(rng, housing, [0.38, 0.48, 0.14]),
            '班次类型': shift,
            '是否夜班岗位': night_flag,
            '月均加班时长': round(overtime, 1),
            '近30天出勤天数': int(np.clip(rng.normal(21.5, 2.2), 14, 27)),
            '近90天缺勤次数': history_count,
            '近180天请假总时长': round(history_hours, 1),
            '通勤时长分钟': round(commute_minutes, 1),
            '通勤距离公里': round(commute_km, 1),
            '是否跨城通勤': cross_city,
            '绩效等级': performance_level,
            '近12月违纪次数': discipline,
            '团队人数': team_size,
            '直属上级管理跨度': manager_span,
            'BMI': bmi,
            '是否慢性病史': chronic_flag,
            '年度体检异常标记': check_abnormal,
            '近30天睡眠时长均值': sleep_hours,
            '每周运动频次': exercise,
            '是否吸烟': smoking,
            '是否饮酒': drinking,
            '心理压力等级': stress_level,
            '是否长期久坐岗位': sedentary,
        })
    return employees


def sample_event(rng, employee):
    """Sample one leave event for ``employee`` and return it as a flat dict.

    The event copies every employee attribute and adds the sampled month,
    weekday, leave type, reason, several 0/1 flags and the target value
    ``'缺勤时长(小时)'``, which is an additive score of workload, health,
    family and contextual bonuses plus Gaussian noise, clipped to [0.5, 18.0].

    Args:
        rng: A ``numpy.random.Generator``.
        employee: An employee dict as produced by ``build_employee_pool``.

    Returns:
        A new dict; ``employee`` itself is not modified.
    """
    month = int(rng.integers(1, 13))
    weekday = int(rng.integers(1, 8))
    near_holiday = int(rng.random() < (0.3 if month in [1, 2, 4, 5, 9, 10] else 0.16))
    leave_type_items = ['病假', '事假', '年假', '调休', '婚假', '丧假', '产检育儿假', '工伤假', '其他']
    leave_probs = [0.26, 0.22, 0.11, 0.14, 0.03, 0.02, 0.07, 0.03, 0.12]
    # Shift the leave-type mix for employees with health flags or 2+ children.
    if employee['是否慢性病史'] == 1 or employee['年度体检异常标记'] == 1:
        leave_probs = [0.34, 0.18, 0.08, 0.1, 0.02, 0.02, 0.08, 0.04, 0.14]
    elif employee['子女数量'] >= 2:
        leave_probs = [0.22, 0.24, 0.1, 0.12, 0.03, 0.02, 0.12, 0.02, 0.13]
    leave_type = weighted_choice(rng, leave_type_items, leave_probs)

    # The set of plausible reasons depends on the leave type.
    if leave_type in ['病假', '工伤假']:
        reason_category = weighted_choice(rng, ['身体不适', '就医复查', '职业疲劳'], [0.52, 0.3, 0.18])
    elif leave_type == '产检育儿假':
        reason_category = weighted_choice(rng, ['子女照护', '家庭事务', '身体不适'], [0.6, 0.25, 0.15])
    elif leave_type in ['婚假', '丧假']:
        reason_category = weighted_choice(rng, ['家庭事务', '突发事件'], [0.72, 0.28])
    elif leave_type in ['年假', '调休']:
        reason_category = weighted_choice(rng, ['职业疲劳', '家庭事务', '交通受阻'], [0.52, 0.28, 0.2])
    else:
        reason_category = weighted_choice(
            rng,
            ['身体不适', '家庭事务', '子女照护', '交通受阻', '突发事件', '职业疲劳'],
            [0.2, 0.22, 0.14, 0.12, 0.12, 0.2],
        )

    medical_certificate = int(
        leave_type in ['病假', '工伤假']
        or reason_category in ['身体不适', '就医复查']
        or (employee['是否慢性病史'] == 1 and leave_type == '其他')
    )
    # The trailing rng.random() calls below are only consumed when the earlier
    # operands are falsy (short-circuit), which is part of the draw sequence.
    urgent_leave = int(
        leave_type in ['病假', '工伤假']
        or reason_category in ['突发事件', '身体不适']
        or (near_holiday == 0 and leave_type == '事假' and rng.random() < 0.35)
    )
    continuous_absence = int(
        leave_type in ['病假', '工伤假', '产检育儿假']
        and (employee['近90天缺勤次数'] >= 2 or employee['近180天请假总时长'] >= 28)
    )
    previous_overtime = int(
        employee['月均加班时长'] >= 30
        or (employee['月均加班时长'] >= 24 and weekday in [1, 2, 5])
        or (employee['是否夜班岗位'] == 1 and rng.random() < 0.65)
    )
    season = season_from_month(month)
    channel = weighted_choice(rng, ['系统申请', '主管代提', '临时电话报备'], [0.68, 0.18, 0.14])

    # Component scores feeding the duration baseline.
    pressure_score = (
        employee['月均加班时长'] * 0.032
        + employee['通勤时长分钟'] * 0.018
        + employee['是否夜班岗位'] * 0.75
        + employee['是否跨城通勤'] * 0.32
        + previous_overtime * 0.35
    )
    health_score = (
        employee['是否慢性病史'] * 1.2
        + employee['年度体检异常标记'] * 0.55
        + (employee['BMI'] >= 28) * 0.3
        + (employee['近30天睡眠时长均值'] < 6.4) * 0.45
    )
    family_score = employee['子女数量'] * 0.18 + employee['是否独生子女家庭负担'] * 0.28
    # Resilience is subtracted from the baseline: better performance, longer
    # tenure and more exercise shorten the sampled absence.
    resilience_score = (
        (0.55 if employee['绩效等级'] == 'A' else 0.25 if employee['绩效等级'] == 'B' else 0.0)
        + min(employee['司龄年数'] / 26, 0.65)
        + min(employee['每周运动频次'] * 0.06, 0.25)
    )

    base = 0.35
    base += pressure_score
    base += health_score
    base += family_score
    base += 0.4 if employee['心理压力等级'] == '高' else (0.18 if employee['心理压力等级'] == '中' else -0.05)
    base += 0.18 if near_holiday else 0.0
    base += 0.35 if continuous_absence else 0.0
    base += 0.28 if employee['近90天缺勤次数'] >= 3 else 0.0
    base += 0.18 if employee['近180天请假总时长'] >= 36 else 0.0
    base -= resilience_score

    leave_bonus = {
        '病假': 2.1,
        '事假': 0.8,
        '年假': 0.15,
        '调休': 0.1,
        '婚假': 3.1,
        '丧假': 2.8,
        '产检育儿假': 2.35,
        '工伤假': 3.9,
        '其他': 0.55,
    }
    reason_bonus = {
        '身体不适': 1.0,
        '家庭事务': 0.55,
        '子女照护': 0.75,
        '交通受阻': 0.2,
        '突发事件': 0.6,
        '职业疲劳': 0.7,
        '就医复查': 1.15,
    }
    industry_bonus = {
        '制造业': 0.42,
        '互联网': 0.22,
        '零售连锁': 0.28,
        '物流运输': 0.5,
        '金融服务': 0.12,
        '医药健康': 0.24,
        '建筑工程': 0.4,
    }
    season_bonus = {1: 0.35, 2: 0.0, 3: 0.15, 4: 0.05}
    weekday_bonus = {1: 0.05, 2: 0.0, 3: 0.0, 4: 0.05, 5: 0.15, 6: 0.25, 7: 0.3}

    duration = base
    duration += leave_bonus[leave_type]
    duration += reason_bonus[reason_category]
    duration += industry_bonus[employee['所属行业']]
    duration += season_bonus[season]
    duration += weekday_bonus[weekday]
    duration += 0.55 if medical_certificate else 0.0
    duration += 0.28 if urgent_leave else -0.06
    if leave_type == '病假' and employee['是否慢性病史'] == 1:
        duration += 0.85
    if leave_type == '工伤假':
        duration += 1.0 + employee['是否夜班岗位'] * 0.3
    if leave_type in ['婚假', '丧假']:
        duration += 0.7 + 0.18 * near_holiday
    if leave_type == '产检育儿假':
        duration += 0.55 + employee['子女数量'] * 0.12
    if leave_type in ['年假', '调休']:
        # Annual leave / compensatory leave is scaled down rather than shifted.
        duration *= 0.82 if near_holiday == 0 else 0.9
    duration = round(float(np.clip(duration + rng.normal(0, 0.35), 0.5, 18.0)), 1)

    event = employee.copy()
    event.update({
        '缺勤月份': month,
        '星期几': weekday,
        '是否节假日前后': near_holiday,
        '季节': season,
        '请假申请渠道': channel,
        '请假类型': leave_type,
        '请假原因大类': reason_category,
        '是否提供医院证明': medical_certificate,
        '是否临时请假': urgent_leave,
        '是否连续缺勤': continuous_absence,
        '前一工作日是否加班': previous_overtime,
        '缺勤时长(小时)': duration,
    })
    return event


def attach_event_timeline(df, random_state=None):
    """Assign each event a date and a per-employee sequence number.

    For every employee (grouped by ``'员工编号'``, first-seen order) a sorted
    set of day offsets in [0, 365) from 2025-01-01 is drawn, one per event.
    The offsets are drawn independently of every other column in ``df``.

    Added columns: ``'事件日期'`` (ISO date string), ``'事件日期索引'`` (day
    offset), ``'事件序号'`` (1-based order within the employee) and
    ``'员工历史事件数'`` (number of events of that employee).

    Args:
        df: Event frame containing an ``'员工编号'`` column.
        random_state: Optional seed for the timeline generator. Defaults to
            ``config.RANDOM_STATE`` when ``None``.

    Returns:
        A new DataFrame with rows grouped by employee and a fresh index.
    """
    df = df.copy()
    rng = np.random.default_rng(config.RANDOM_STATE if random_state is None else random_state)
    base_date = np.datetime64('2025-01-01')

    if len(df) == 0:
        # pd.concat([]) raises ValueError; return an empty frame that still
        # carries the timeline columns instead.
        df['事件日期'] = pd.Series(dtype=object)
        for column in ('事件日期索引', '事件序号', '员工历史事件数'):
            df[column] = pd.Series(dtype=int)
        return df.reset_index(drop=True)

    timelines = []
    for _employee_id, group in df.groupby('员工编号', sort=False):
        group = group.copy().reset_index(drop=True)
        event_count = len(group)
        offsets = np.sort(rng.integers(0, 365, size=event_count))
        group['事件日期'] = [
            str(pd.Timestamp(base_date + np.timedelta64(int(offset), 'D')).date())
            for offset in offsets
        ]
        group['事件日期索引'] = offsets.astype(int)
        group['事件序号'] = np.arange(1, event_count + 1)
        group['员工历史事件数'] = event_count
        timelines.append(group)
    return pd.concat(timelines, ignore_index=True)


def validate_dataset(df):
    """Sanity-check a generated (or cached) event dataset.

    Checks that the required columns exist, that the dataset has at least
    10000 rows and 2000 distinct employees, that the share of events longer
    than 8 hours lies in [0.15, 0.4], and that events with a medical
    certificate / on night-shift posts have a strictly higher mean duration
    than their complements.

    Raises:
        ValueError: On the first check that fails.
    """
    required_columns = [
        '员工编号',
        '所属行业',
        '岗位序列',
        '月均加班时长',
        '通勤时长分钟',
        '是否慢性病史',
        '请假类型',
        '事件序号',
        '事件日期索引',
        '员工历史事件数',
        '缺勤时长(小时)',
        # Read by the signal checks below; listing them here turns a bare
        # KeyError into the same ValueError as every other missing column.
        '是否提供医院证明',
        '是否夜班岗位',
    ]
    for column in required_columns:
        if column not in df.columns:
            raise ValueError(f'Missing required column: {column}')
    if len(df) < 10000:
        raise ValueError('Synthetic dataset is smaller than expected')
    if df['员工编号'].nunique() < 2000:
        raise ValueError('Employee coverage is too small')
    high_risk_ratio = (df['缺勤时长(小时)'] > 8).mean()
    if not 0.15 <= high_risk_ratio <= 0.4:
        raise ValueError(f'High risk ratio out of range: {high_risk_ratio:.3f}')

    medical_mean = df[df['是否提供医院证明'] == 1]['缺勤时长(小时)'].mean()
    no_medical_mean = df[df['是否提供医院证明'] == 0]['缺勤时长(小时)'].mean()
    # ``not a > b`` (rather than ``a <= b``) also fails when either group is
    # empty and its mean is NaN.
    if not medical_mean > no_medical_mean:
        raise ValueError('Medical certificate signal is not effective')

    night_mean = df[df['是否夜班岗位'] == 1]['缺勤时长(小时)'].mean()
    day_mean = df[df['是否夜班岗位'] == 0]['缺勤时长(小时)'].mean()
    if not night_mean > day_mean:
        raise ValueError('Night shift signal is not effective')


def _ensure_parent_dir(path):
    """Create the parent directory of ``path`` if it has one."""
    parent = os.path.dirname(path)
    # dirname is '' for a bare filename, and os.makedirs('') raises.
    if parent:
        os.makedirs(parent, exist_ok=True)


def generate_dataset(output_path=None, sample_count=12000, random_state=None):
    """Generate, validate and optionally persist the synthetic dataset.

    Args:
        output_path: When truthy, the dataset is written there as CSV
            (``utf-8-sig``), creating the parent directory if needed.
        sample_count: Number of leave events to sample.
        random_state: Seed for the generators; ``config.RANDOM_STATE`` is used
            when ``None``.

    Returns:
        The generated DataFrame.

    Raises:
        ValueError: If the generated data fails ``validate_dataset``.
    """
    rng = np.random.default_rng(config.RANDOM_STATE if random_state is None else random_state)
    companies = build_company_pool(rng)
    employees = build_employee_pool(rng, companies)
    events = []
    # Employees are drawn with replacement, so one employee can own several
    # events.
    employee_idx = rng.integers(0, len(employees), size=sample_count)
    for idx in employee_idx:
        events.append(sample_event(rng, employees[int(idx)]))
    # Forward the seed so an explicit random_state also drives the timeline.
    df = attach_event_timeline(pd.DataFrame(events), random_state=random_state)
    validate_dataset(df)
    if output_path:
        _ensure_parent_dir(output_path)
        df.to_csv(output_path, index=False, encoding='utf-8-sig')
    return df


def enrich_with_jdr_columns(df):
    """Append JD-R (Job Demands-Resources) theory columns to existing data.

    Synthesises 16 new columns from the existing employee/event attributes:
    - Job demands: 工作自主性, 情绪劳动强度, 时间压力感知, 角色模糊度, 工作家庭冲突
    - Job resources: 上级支持, 同事支持, 技能多样性, 职业发展机会, 参与决策, 组织公平感
    - Personal resources: 自我效能感, 心理韧性, 乐观程度
    - Mediators: 工作倦怠, 工作投入

    A ``'_jdr_version'`` column holding ``config.JDR_DATA_VERSION`` is added as
    well. The input frame is not modified; a copy is returned. The generator
    is seeded with ``config.RANDOM_STATE + 100``.
    """
    rng = np.random.default_rng(config.RANDOM_STATE + 100)
    df = df.copy()
    n = len(df)

    # -- Helper: conditional Likert-style score generation --
    def likert(mean_offset, std=0.8, low=1.0, high=5.0):
        return np.clip(rng.normal(mean_offset, std, size=n), low, high)

    # -- Pre-extract columns --
    overtime = df['月均加班时长'].values
    commute = df['通勤时长分钟'].values
    night = df['是否夜班岗位'].values
    children = df['子女数量'].values
    married_arr = (df['婚姻状态'] == '已婚').astype(int).values
    tenure = df['司龄年数'].values
    team_size = df['团队人数'].values
    manager_span = df['直属上级管理跨度'].values
    exercise = df['每周运动频次'].values
    sleep = df['近30天睡眠时长均值'].values
    chronic = df['是否慢性病史'].values
    perf_a = (df['绩效等级'] == 'A').astype(int).values
    perf_ab = df['绩效等级'].isin(['A', 'B']).astype(int).values
    level_map = {'初级': 0, '中级': 1, '高级': 2, '主管': 3, '经理及以上': 4}
    # Unknown categories fall back to a mid/low default instead of NaN.
    level_vals = df['岗位级别'].map(level_map).fillna(1).values
    industry_vals = df['所属行业'].values
    employment_type = df['用工类型'].values
    job_family = df['岗位序列'].values
    company_scale_map = {
        '100人以下': 0, '100-499人': 1, '500-999人': 2,
        '1000-4999人': 3, '5000人及以上': 4
    }
    scale_vals = df['企业规模'].map(company_scale_map).fillna(1).values
    formal_employee = (df['用工类型'] == '正式员工').astype(int).values
    edu_map = {'中专及以下': 0, '大专': 1, '本科': 2, '硕士': 3, '博士': 4}
    edu_vals = df['最高学历'].map(edu_map).fillna(2).values

    # -- Job demands dimension (5 columns) --
    df['工作自主性'] = likert(
        3.2 + level_vals * 0.25
        + np.isin(industry_vals, ['互联网', '金融服务']).astype(int) * 0.3
        - night * 0.4
    ).round(1)
    df['情绪劳动强度'] = likert(
        2.8 + np.isin(job_family, ['客服坐席', '销售业务']).astype(int) * 0.6
        + np.isin(industry_vals, ['医药健康', '零售连锁']).astype(int) * 0.3
    ).round(1)
    df['时间压力感知'] = likert(
        3.0 + overtime * 0.02 + commute * 0.01
        + np.isin(industry_vals, ['互联网', '金融服务']).astype(int) * 0.2
    ).round(1)
    df['角色模糊度'] = likert(
        2.5 + np.isin(employment_type, ['劳务派遣', '外包驻场']).astype(int) * 0.5
        - tenure * 0.05
    ).round(1)
    df['工作家庭冲突'] = likert(
        2.6 + overtime * 0.02 + children * 0.3 + married_arr * 0.3
    ).round(1)

    # -- Job resources dimension (6 columns) --
    df['上级支持'] = likert(
        3.4 - manager_span * 0.02 + level_vals * 0.2
    ).round(1)
    df['同事支持'] = likert(
        3.3 + team_size * 0.02
        + np.isin(job_family, ['管理', '专业技术']).astype(int) * 0.2
    ).round(1)
    df['技能多样性'] = likert(
        3.0 + np.isin(job_family, ['专业技术', '管理']).astype(int) * 0.5
        - np.isin(job_family, ['生产操作']).astype(int) * 0.3
    ).round(1)
    df['职业发展机会'] = likert(
        3.1 + np.isin(industry_vals, ['互联网', '金融服务']).astype(int) * 0.4
        + scale_vals * 0.1
    ).round(1)
    df['参与决策'] = likert(
        2.8 + level_vals * 0.35
    ).round(1)
    df['组织公平感'] = likert(
        3.3 + formal_employee * 0.4 + perf_ab * 0.3
    ).round(1)

    # -- Personal resources dimension (3 columns) --
    df['自我效能感'] = likert(
        3.3 + perf_a * 0.4 + perf_ab * 0.2 + tenure * 0.03 + edu_vals * 0.08
    ).round(1)
    df['心理韧性'] = likert(
        3.2 + exercise * 0.1 + sleep * 0.15 + tenure * 0.02
    ).round(1)
    df['乐观程度'] = likert(
        3.3 + perf_ab * 0.3 - chronic * 0.3 + married_arr * 0.15
    ).round(1)

    # -- Mediator variables (2 columns) --
    # Burnout (1-7): health-impairment process -- high demands -> high burnout
    df['工作倦怠'] = np.clip(
        rng.normal(3.0, 0.8, size=n)
        + overtime * 0.015 + night * 0.3 + commute * 0.008
        + df['情绪劳动强度'].values * 0.25
        + df['时间压力感知'].values * 0.25
        + df['工作家庭冲突'].values * 0.2
        + df['角色模糊度'].values * 0.15
        - df['工作自主性'].values * 0.2
        - df['上级支持'].values * 0.15
        - df['自我效能感'].values * 0.2
        - df['心理韧性'].values * 0.15,
        1.0, 7.0
    ).round(1)

    # Engagement (1-7): motivational process -- high resources -> high engagement
    df['工作投入'] = np.clip(
        rng.normal(3.5, 0.8, size=n)
        + df['工作自主性'].values * 0.2
        + df['上级支持'].values * 0.2
        + df['同事支持'].values * 0.15
        + df['技能多样性'].values * 0.15
        + df['职业发展机会'].values * 0.15
        + df['参与决策'].values * 0.1
        + df['组织公平感'].values * 0.1
        + df['自我效能感'].values * 0.2
        + df['心理韧性'].values * 0.15
        + df['乐观程度'].values * 0.15
        - df['工作倦怠'].values * 0.2,
        1.0, 7.0
    ).round(1)

    # JD-R data version marker
    df['_jdr_version'] = config.JDR_DATA_VERSION

    return df


def ensure_dataset():
    """Make sure a valid, JD-R-enriched dataset exists at ``config.RAW_DATA_PATH``.

    The cached CSV is regenerated when it is missing, unreadable or fails
    ``validate_dataset``. If any of the probed JD-R columns is absent the data
    is enriched with ``enrich_with_jdr_columns`` and written back.

    NOTE(review): the ``'_jdr_version'`` column written by the enrichment step
    is not compared against ``config.JDR_DATA_VERSION`` here.
    """
    needs_regenerate = not os.path.exists(config.RAW_DATA_PATH)
    if not needs_regenerate:
        try:
            df = pd.read_csv(config.RAW_DATA_PATH)
            validate_dataset(df)
        except Exception:
            # Deliberate best-effort: any failure to read or validate the
            # cached file is treated as "cache unusable" and triggers a rebuild.
            needs_regenerate = True
    if needs_regenerate:
        generate_dataset(config.RAW_DATA_PATH)
        df = pd.read_csv(config.RAW_DATA_PATH)

    # Check whether JD-R enrichment is still needed
    jdr_columns = ['工作自主性', '上级支持', '自我效能感', '工作倦怠', '工作投入']
    if not all(col in df.columns for col in jdr_columns):
        df = enrich_with_jdr_columns(df)
        _ensure_parent_dir(config.RAW_DATA_PATH)
        df.to_csv(config.RAW_DATA_PATH, index=False, encoding='utf-8-sig')


if __name__ == '__main__':
    dataset = generate_dataset(config.RAW_DATA_PATH)
    print(f'Generated dataset: {config.RAW_DATA_PATH}')
    print(dataset.head())