# forsetsystem/backend/core/generate_dataset.py

import os
import sys

import numpy as np
import pandas as pd

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import config


# Per-industry bias coefficients, keyed by industry name.
#   shift_bias    - used in build_employee_pool to weight the shift-pattern draw:
#                   higher values shift weight from '标准白班' to '两班倒' / '三班倒'.
#   overtime_bias - scales the mean of the monthly-overtime draw (22 + 18 * bias).
#   night_bias    - probability that a '两班倒' employee is flagged as a night-shift post.
# NOTE: key order matters. build_company_pool pairs list(INDUSTRIES.keys())
# positionally with a hard-coded list of sampling weights.
INDUSTRIES = {
    '制造业': {'shift_bias': 0.9, 'overtime_bias': 0.8, 'night_bias': 0.8},
    '互联网': {'shift_bias': 0.2, 'overtime_bias': 1.0, 'night_bias': 0.2},
    '零售连锁': {'shift_bias': 0.7, 'overtime_bias': 0.5, 'night_bias': 0.3},
    '物流运输': {'shift_bias': 0.9, 'overtime_bias': 0.7, 'night_bias': 0.9},
    '金融服务': {'shift_bias': 0.1, 'overtime_bias': 0.7, 'night_bias': 0.1},
    '医药健康': {'shift_bias': 0.6, 'overtime_bias': 0.6, 'night_bias': 0.5},
    '建筑工程': {'shift_bias': 0.5, 'overtime_bias': 0.8, 'night_bias': 0.3},
}


def season_from_month(month):
    """Map a calendar month to a season code.

    Months 12/1/2 map to 1, 3/4/5 to 2 and 6/7/8 to 3; every other value
    (including 9/10/11) maps to 4.
    """
    months_by_season = {
        1: (12, 1, 2),
        2: (3, 4, 5),
        3: (6, 7, 8),
    }
    for season_code, months in months_by_season.items():
        if month in months:
            return season_code
    # Anything not matched above falls through to the last season code.
    return 4


def weighted_choice(rng, items, probs):
    """Draw one element of ``items`` using ``probs`` as relative weights.

    The weights are converted to floats and rescaled to sum to 1 before
    being handed to ``rng.choice``, so callers may pass unnormalized values.
    """
    weights = np.asarray(probs, dtype=float)
    normalized = weights / weights.sum()
    return rng.choice(items, p=normalized)


def build_company_pool(rng, company_count=180):
    """Create ``company_count`` synthetic company records.

    Each record is a dict with a sequential id ('C001', 'C002', ...), an
    industry taken from ``INDUSTRIES``, a headcount band and a city tier.
    Per company the random draws happen in the order industry, scale, tier.
    """
    # Industry weights are positional: they follow the key order of INDUSTRIES.
    industry_names = list(INDUSTRIES)
    industry_weights = [0.22, 0.14, 0.14, 0.14, 0.1, 0.12, 0.14]

    scale_names = ['100人以下', '100-499人', '500-999人', '1000-4999人', '5000人及以上']
    scale_weights = [0.15, 0.28, 0.2, 0.24, 0.13]

    tier_names = ['一线', '新一线', '二线', '三线及以下']
    tier_weights = [0.18, 0.34, 0.3, 0.18]

    return [
        {
            '企业编号': f'C{serial:03d}',
            '所属行业': weighted_choice(rng, industry_names, industry_weights),
            '企业规模': weighted_choice(rng, scale_names, scale_weights),
            '所在城市等级': weighted_choice(rng, tier_names, tier_weights),
        }
        for serial in range(1, company_count + 1)
    ]


def build_employee_pool(rng, companies, employee_count=2600):
    """Generate synthetic employee profiles attached to randomly chosen companies.

    Args:
        rng: Random generator exposing ``integers``, ``normal``, ``poisson``,
            ``random``, ``uniform`` and ``choice`` (generate_dataset passes a
            ``numpy.random.default_rng`` instance).
        companies: Indexable collection of company dicts carrying the keys
            '企业编号', '所属行业', '企业规模' and '所在城市等级'.
        employee_count: Number of employee records to create.

    Returns:
        A list of dicts, one per employee, combining the company attributes
        with demographic, workload, commute, health and lifestyle fields.

    NOTE: every value is drawn from the single shared ``rng`` stream, so the
    order of the statements below (including draws hidden behind
    short-circuiting ``or`` / ``and``) determines the output for a given seed.
    """
    genders = ['男', '女']
    employment_types = ['正式员工', '劳务派遣', '外包驻场', '实习生']
    departments = ['生产', '研发', '销售', '客服', '职能', '仓储物流', '门店运营']
    job_families = ['管理', '专业技术', '销售业务', '生产操作', '行政支持', '客服坐席']
    job_levels = ['初级', '中级', '高级', '主管', '经理及以上']
    educations = ['中专及以下', '大专', '本科', '硕士', '博士']
    marital = ['未婚', '已婚', '离异/其他']
    housing = ['自有住房', '租房', '宿舍']
    shifts = ['标准白班', '两班倒', '三班倒', '弹性班']
    performance = ['A', 'B', 'C', 'D']
    stress = ['低', '中', '高']

    employees = []
    for idx in range(employee_count):
        # Pick the employer by a random index into the company pool.
        company = companies[rng.integers(0, len(companies))]
        industry = company['所属行业']
        # Age: normal(33, 7) clipped to [20, 55] and truncated to an int.
        age = int(np.clip(rng.normal(33, 7), 20, 55))
        # Tenure is centred on (age - 21) with noise, clipped to 0.2..32 years.
        tenure = round(float(np.clip(age - 21 + rng.normal(0, 2), 0.2, 32)), 1)
        family_bias = 0.6 if age >= 30 else 0.25
        # Marital-status weights differ for under-30s versus everyone else.
        married = weighted_choice(rng, marital, [0.45, 0.48, 0.07] if age < 30 else [0.18, 0.72, 0.1])
        # NOTE(review): '未婚' uses a Poisson mean of 0.4, which is higher than the
        # 0.25 applied to other under-30 employees; confirm this is intended.
        children = int(np.clip(rng.poisson(0.4 if married == '未婚' else family_bias), 0, 3))
        industry_profile = INDUSTRIES[industry]
        # Relative (unnormalized) weights in the order of `shifts`;
        # weighted_choice rescales them to sum to 1.
        shift = weighted_choice(
            rng,
            shifts,
            [
                max(0.1, 1 - industry_profile['shift_bias']),
                0.35 * industry_profile['shift_bias'],
                0.25 * industry_profile['shift_bias'],
                0.2,
            ],
        )
        # '三班倒' is always a night post; '两班倒' is one with probability night_bias.
        # rng.random() is only consumed when the shift is '两班倒'.
        night_flag = int(shift == '三班倒' or (shift == '两班倒' and rng.random() < industry_profile['night_bias']))
        # Monthly overtime: mean rises with the industry's overtime_bias, clipped to [0, 90].
        overtime = float(np.clip(rng.normal(22 + 18 * industry_profile['overtime_bias'], 10), 0, 90))
        commute_minutes = float(np.clip(rng.normal(42, 18), 8, 130))
        # Distance is derived from the commute time via a random factor in [0.35, 0.75).
        commute_km = float(np.clip(commute_minutes * rng.uniform(0.35, 0.75), 2, 65))
        performance_level = weighted_choice(rng, performance, [0.18, 0.46, 0.26, 0.1])
        # Chronic-illness probability grows by 0.01 per year of age above 26, with a 0.05 floor.
        chronic_flag = int(rng.random() < max(0.05, (age - 26) * 0.01))
        # A chronic condition always implies an abnormal check-up; otherwise a 14% chance.
        # rng.random() is skipped when chronic_flag == 1.
        check_abnormal = int(chronic_flag == 1 or rng.random() < 0.14)
        # Night-shift posts lower the mean sleep duration by 0.35.
        sleep_hours = round(float(np.clip(rng.normal(6.9 - 0.35 * night_flag, 0.8), 4.5, 9.0)), 1)
        exercise = int(np.clip(rng.poisson(2.2), 0, 7))
        # Two-stage draw: a 55% chance of using a 0.22 smoking rate, otherwise 0.08.
        # NOTE(review): this is not tied to the '性别' value sampled further below.
        smoking = int(rng.random() < (0.22 if rng.random() < 0.55 else 0.08))
        drinking = int(rng.random() < 0.27)
        # The weight of '高' increases with overtime (by at most 0.15); weights are renormalized.
        stress_level = weighted_choice(
            rng,
            stress,
            [0.22, 0.52, 0.26 + min(0.15, overtime / 120)],
        )
        bmi = round(float(np.clip(rng.normal(24.2, 3.2), 17.5, 36.5)), 1)
        # Historical absence count and leave hours rise with chronic illness and children.
        history_count = int(np.clip(rng.poisson(1.2 + chronic_flag * 0.6 + children * 0.15), 0, 8))
        history_hours = float(np.clip(rng.normal(18 + chronic_flag * 10 + history_count * 3, 10), 0, 120))
        discipline = int(np.clip(rng.poisson(0.2), 0, 4))
        team_size = int(np.clip(rng.normal(11, 5), 3, 40))
        manager_span = int(np.clip(team_size + rng.normal(3, 2), 4, 60))
        local_hukou = int(rng.random() < 0.58)
        # Cross-city when the commute exceeds 65 minutes, or (non-local hukou and a 35% draw).
        # rng.random() is only consumed when the first two conditions require it.
        cross_city = int(commute_minutes > 65 or (local_hukou == 0 and rng.random() < 0.35))
        # Internet / finance employees are sedentary with weight 0.55; everyone else with probability 0.3.
        sedentary = int(weighted_choice(rng, [0, 1], [0.45, 0.55]) if company['所属行业'] in ['互联网', '金融服务'] else rng.random() < 0.3)

        # The dict values below are evaluated top to bottom, so the remaining
        # random draws happen in the order the keys are written.
        employees.append({
            '企业编号': company['企业编号'],
            '所属行业': industry,
            '企业规模': company['企业规模'],
            '所在城市等级': company['所在城市等级'],
            '用工类型': weighted_choice(rng, employment_types, [0.74, 0.12, 0.1, 0.04]),
            '部门条线': weighted_choice(rng, departments, [0.18, 0.16, 0.14, 0.11, 0.12, 0.14, 0.15]),
            '岗位序列': weighted_choice(rng, job_families, [0.08, 0.24, 0.16, 0.2, 0.12, 0.2]),
            '岗位级别': weighted_choice(rng, job_levels, [0.34, 0.32, 0.18, 0.11, 0.05]),
            '员工编号': f'E{idx + 1:05d}',
            '性别': weighted_choice(rng, genders, [0.56, 0.44]),
            '年龄': age,
            '司龄年数': tenure,
            '最高学历': weighted_choice(rng, educations, [0.14, 0.28, 0.4, 0.15, 0.03]),
            '婚姻状态': married,
            '是否本地户籍': local_hukou,
            '子女数量': children,
            # Set when there are 2+ children, or for married employees on an 18% draw.
            '是否独生子女家庭负担': int(children >= 2 or (married == '已婚' and rng.random() < 0.18)),
            '居住类型': weighted_choice(rng, housing, [0.38, 0.48, 0.14]),
            '班次类型': shift,
            '是否夜班岗位': night_flag,
            '月均加班时长': round(overtime, 1),
            '近30天出勤天数': int(np.clip(rng.normal(21.5, 2.2), 14, 27)),
            '近90天缺勤次数': history_count,
            '近180天请假总时长': round(history_hours, 1),
            '通勤时长分钟': round(commute_minutes, 1),
            '通勤距离公里': round(commute_km, 1),
            '是否跨城通勤': cross_city,
            '绩效等级': performance_level,
            '近12月违纪次数': discipline,
            '团队人数': team_size,
            '直属上级管理跨度': manager_span,
            'BMI': bmi,
            '是否慢性病史': chronic_flag,
            '年度体检异常标记': check_abnormal,
            '近30天睡眠时长均值': sleep_hours,
            '每周运动频次': exercise,
            '是否吸烟': smoking,
            '是否饮酒': drinking,
            '心理压力等级': stress_level,
            '是否长期久坐岗位': sedentary,
        })
    return employees


def sample_event(rng, employee):
    """Sample one absence event for ``employee`` and compute its duration.

    Args:
        rng: Random generator exposing ``integers``, ``random``, ``normal`` and
            ``choice`` (generate_dataset passes a ``numpy.random.default_rng``
            instance).
        employee: Employee dict as produced by build_employee_pool; it is
            copied, not mutated.

    Returns:
        A new dict containing every employee field plus the event fields
        (month, weekday, holiday flag, season, channel, leave type, reason,
        certificate / urgency / continuity / overtime flags) and the target
        '缺勤时长（小时）', which is clipped to [0.5, 18.0] and rounded to 0.1.

    NOTE: random draws share one stream, and several sit behind
    short-circuiting ``or`` / ``and``; reordering statements changes the
    output for a given seed.
    """
    # Calendar context: month in 1..12, weekday in 1..7.
    month = int(rng.integers(1, 13))
    weekday = int(rng.integers(1, 8))
    # The holiday-adjacent probability is 0.3 in the listed months and 0.16 otherwise.
    near_holiday = int(rng.random() < (0.3 if month in [1, 2, 4, 5, 9, 10] else 0.16))
    leave_type_items = ['病假', '事假', '年假', '调休', '婚假', '丧假', '产检育儿假', '工伤假', '其他']
    # Default leave-type weights, positionally aligned with leave_type_items.
    leave_probs = [0.26, 0.22, 0.11, 0.14, 0.03, 0.02, 0.07, 0.03, 0.12]
    if employee['是否慢性病史'] == 1 or employee['年度体检异常标记'] == 1:
        # Health issues raise the weight of sick leave.
        leave_probs = [0.34, 0.18, 0.08, 0.1, 0.02, 0.02, 0.08, 0.04, 0.14]
    elif employee['子女数量'] >= 2:
        # Two or more children raise the weight of personal and childcare leave.
        leave_probs = [0.22, 0.24, 0.1, 0.12, 0.03, 0.02, 0.12, 0.02, 0.13]
    leave_type = weighted_choice(rng, leave_type_items, leave_probs)

    # The set of possible reasons depends on the leave type.
    if leave_type in ['病假', '工伤假']:
        reason_category = weighted_choice(rng, ['身体不适', '就医复查', '职业疲劳'], [0.52, 0.3, 0.18])
    elif leave_type == '产检育儿假':
        reason_category = weighted_choice(rng, ['子女照护', '家庭事务', '身体不适'], [0.6, 0.25, 0.15])
    elif leave_type in ['婚假', '丧假']:
        reason_category = weighted_choice(rng, ['家庭事务', '突发事件'], [0.72, 0.28])
    elif leave_type in ['年假', '调休']:
        reason_category = weighted_choice(rng, ['职业疲劳', '家庭事务', '交通受阻'], [0.52, 0.28, 0.2])
    else:
        reason_category = weighted_choice(
            rng,
            ['身体不适', '家庭事务', '子女照护', '交通受阻', '突发事件', '职业疲劳'],
            [0.2, 0.22, 0.14, 0.12, 0.12, 0.2],
        )

    # Deterministic flag: sick / injury leave, a health-related reason, or
    # '其他' leave taken by an employee with a chronic condition.
    medical_certificate = int(
        leave_type in ['病假', '工伤假']
        or reason_category in ['身体不适', '就医复查']
        or (employee['是否慢性病史'] == 1 and leave_type == '其他')
    )
    # rng.random() is only consumed when the first two clauses are False and
    # the event is a '事假' that is not holiday-adjacent.
    urgent_leave = int(
        leave_type in ['病假', '工伤假']
        or reason_category in ['突发事件', '身体不适']
        or (near_holiday == 0 and leave_type == '事假' and rng.random() < 0.35)
    )
    # Continuous absence requires a qualifying leave type AND a heavy absence history.
    continuous_absence = int(
        leave_type in ['病假', '工伤假', '产检育儿假']
        and (employee['近90天缺勤次数'] >= 2 or employee['近180天请假总时长'] >= 28)
    )
    # rng.random() is only consumed for night-shift employees who do not
    # already satisfy one of the overtime clauses.
    previous_overtime = int(
        employee['月均加班时长'] >= 30
        or (employee['月均加班时长'] >= 24 and weekday in [1, 2, 5])
        or (employee['是否夜班岗位'] == 1 and rng.random() < 0.65)
    )
    season = season_from_month(month)
    channel = weighted_choice(rng, ['系统申请', '主管代提', '临时电话报备'], [0.68, 0.18, 0.14])

    # Additive score components that feed the duration baseline.
    pressure_score = (
        employee['月均加班时长'] * 0.032
        + employee['通勤时长分钟'] * 0.018
        + employee['是否夜班岗位'] * 0.75
        + employee['是否跨城通勤'] * 0.32
        + previous_overtime * 0.35
    )
    # The boolean comparisons are multiplied as 0/1.
    health_score = (
        employee['是否慢性病史'] * 1.2
        + employee['年度体检异常标记'] * 0.55
        + (employee['BMI'] >= 28) * 0.3
        + (employee['近30天睡眠时长均值'] < 6.4) * 0.45
    )
    family_score = employee['子女数量'] * 0.18 + employee['是否独生子女家庭负担'] * 0.28
    # Resilience is subtracted from the baseline; the tenure and exercise terms are capped.
    resilience_score = (
        (0.55 if employee['绩效等级'] == 'A' else 0.25 if employee['绩效等级'] == 'B' else 0.0)
        + min(employee['司龄年数'] / 26, 0.65)
        + min(employee['每周运动频次'] * 0.06, 0.25)
    )

    base = 0.35
    base += pressure_score
    base += health_score
    base += family_score
    base += 0.4 if employee['心理压力等级'] == '高' else (0.18 if employee['心理压力等级'] == '中' else -0.05)
    base += 0.18 if near_holiday else 0.0
    base += 0.35 if continuous_absence else 0.0
    base += 0.28 if employee['近90天缺勤次数'] >= 3 else 0.0
    base += 0.18 if employee['近180天请假总时长'] >= 36 else 0.0
    base -= resilience_score

    # Additive lookup tables keyed by the categorical event / employee attributes.
    leave_bonus = {
        '病假': 2.1,
        '事假': 0.8,
        '年假': 0.15,
        '调休': 0.1,
        '婚假': 3.1,
        '丧假': 2.8,
        '产检育儿假': 2.35,
        '工伤假': 3.9,
        '其他': 0.55,
    }
    reason_bonus = {
        '身体不适': 1.0,
        '家庭事务': 0.55,
        '子女照护': 0.75,
        '交通受阻': 0.2,
        '突发事件': 0.6,
        '职业疲劳': 0.7,
        '就医复查': 1.15,
    }
    industry_bonus = {
        '制造业': 0.42,
        '互联网': 0.22,
        '零售连锁': 0.28,
        '物流运输': 0.5,
        '金融服务': 0.12,
        '医药健康': 0.24,
        '建筑工程': 0.4,
    }
    season_bonus = {1: 0.35, 2: 0.0, 3: 0.15, 4: 0.05}
    weekday_bonus = {1: 0.05, 2: 0.0, 3: 0.0, 4: 0.05, 5: 0.15, 6: 0.25, 7: 0.3}

    duration = base
    duration += leave_bonus[leave_type]
    duration += reason_bonus[reason_category]
    duration += industry_bonus[employee['所属行业']]
    duration += season_bonus[season]
    duration += weekday_bonus[weekday]
    duration += 0.55 if medical_certificate else 0.0
    duration += 0.28 if urgent_leave else -0.06

    # Leave-type specific interaction terms.
    if leave_type == '病假' and employee['是否慢性病史'] == 1:
        duration += 0.85
    if leave_type == '工伤假':
        duration += 1.0 + employee['是否夜班岗位'] * 0.3
    if leave_type in ['婚假', '丧假']:
        duration += 0.7 + 0.18 * near_holiday
    if leave_type == '产检育儿假':
        duration += 0.55 + employee['子女数量'] * 0.12
    if leave_type in ['年假', '调休']:
        # Annual / compensatory leave is scaled down (0.82 off-holiday, 0.9 near holidays).
        duration *= 0.82 if near_holiday == 0 else 0.9

    # Add Gaussian noise, clamp to [0.5, 18.0] and round to one decimal.
    duration = round(float(np.clip(duration + rng.normal(0, 0.35), 0.5, 18.0)), 1)

    # Copy so the shared employee record is not modified by the event fields.
    event = employee.copy()
    event.update({
        '缺勤月份': month,
        '星期几': weekday,
        '是否节假日前后': near_holiday,
        '季节': season,
        '请假申请渠道': channel,
        '请假类型': leave_type,
        '请假原因大类': reason_category,
        '是否提供医院证明': medical_certificate,
        '是否临时请假': urgent_leave,
        '是否连续缺勤': continuous_absence,
        '前一工作日是否加班': previous_overtime,
        '缺勤时长（小时）': duration,
    })
    return event


def attach_event_timeline(df, random_state=None):
    """Attach a per-employee chronological timeline to an event table.

    For each employee (grouped by '员工编号', in order of first appearance) one
    day offset in 0..364 is drawn per event, the offsets are sorted, and the
    rows of that employee receive them in their existing order.

    Added columns:
        '事件日期'       ISO date string, 2025-01-01 plus the offset in days.
        '事件日期索引'   The day offset itself.
        '事件序号'       1-based position of the event within the employee.
        '员工历史事件数' Total number of events of that employee.

    Args:
        df: Event table containing an '员工编号' column. It is not modified.
        random_state: Seed for the date draws. ``None`` (the default) keeps
            the previous behaviour of seeding from ``config.RANDOM_STATE``.

    Returns:
        A new DataFrame with a fresh 0..n-1 index, rows grouped by employee.
        A zero-row input yields a zero-row frame that still carries the four
        timeline columns.

    Raises:
        KeyError: If a non-empty ``df`` has no '员工编号' column.

    NOTE(review): dates are drawn independently of the '缺勤月份' / '星期几'
    columns that sample_event writes, so the calendar fields of a row may
    disagree with its '事件日期' — confirm whether that is intended.
    """
    df = df.copy()

    if len(df) == 0:
        # No groups would be produced and pd.concat([]) raises, so build the
        # empty result explicitly.
        df = df.reset_index(drop=True)
        df['事件日期'] = np.array([], dtype=object)
        df['事件日期索引'] = np.array([], dtype=int)
        df['事件序号'] = np.array([], dtype=int)
        df['员工历史事件数'] = np.array([], dtype=int)
        return df

    seed = config.RANDOM_STATE if random_state is None else random_state
    rng = np.random.default_rng(seed)
    base_date = np.datetime64('2025-01-01')
    timelines = []

    for _, group in df.groupby('员工编号', sort=False):
        group = group.copy().reset_index(drop=True)
        event_count = len(group)
        # Sorting makes the event sequence number increase with the date.
        offsets = np.sort(rng.integers(0, 365, size=event_count))
        event_dates = base_date + offsets.astype('timedelta64[D]')
        group['事件日期'] = np.datetime_as_string(event_dates, unit='D').tolist()
        group['事件日期索引'] = offsets.astype(int)
        group['事件序号'] = np.arange(1, event_count + 1)
        group['员工历史事件数'] = event_count
        timelines.append(group)

    return pd.concat(timelines, ignore_index=True)


def validate_dataset(df, min_rows=10000, min_employees=2000, high_risk_range=(0.15, 0.4)):
    """Sanity-check an absence dataset and raise if it looks wrong.

    Checks, in order: required columns are present, the table has at least
    ``min_rows`` rows and ``min_employees`` distinct employees, the share of
    events longer than 8 lies inside ``high_risk_range``, and the mean
    duration is strictly higher for events with a medical certificate and
    for night-shift posts than for their counterparts.

    Args:
        df: Event table to validate.
        min_rows: Minimum acceptable number of rows.
        min_employees: Minimum acceptable number of distinct '员工编号' values.
        high_risk_range: Inclusive ``(low, high)`` bounds for the fraction of
            rows whose '缺勤时长（小时）' exceeds 8.

    Returns:
        None. The function only raises on failure.

    Raises:
        ValueError: If any check fails, including when a column needed by a
            later check is missing or one of the compared groups is empty.
    """
    required_columns = [
        '员工编号',
        '所属行业',
        '岗位序列',
        '月均加班时长',
        '通勤时长分钟',
        '是否慢性病史',
        '请假类型',
        '事件序号',
        '事件日期索引',
        '员工历史事件数',
        '缺勤时长（小时）',
        # Needed by the signal checks below; listing them here turns a raw
        # KeyError into the same ValueError as every other missing column.
        '是否提供医院证明',
        '是否夜班岗位',
    ]
    for column in required_columns:
        if column not in df.columns:
            raise ValueError(f'Missing required column: {column}')

    if len(df) < min_rows:
        raise ValueError('Synthetic dataset is smaller than expected')
    if df['员工编号'].nunique() < min_employees:
        raise ValueError('Employee coverage is too small')

    duration = df['缺勤时长（小时）']
    low, high = high_risk_range
    high_risk_ratio = (duration > 8).mean()
    if not low <= high_risk_ratio <= high:
        raise ValueError(f'High risk ratio out of range: {high_risk_ratio:.3f}')

    # `not (a > b)` instead of `a <= b`: the mean of an empty group is NaN and
    # every comparison with NaN is False, so `a <= b` would let a dataset
    # with a missing group pass silently.
    medical_mean = duration[df['是否提供医院证明'] == 1].mean()
    no_medical_mean = duration[df['是否提供医院证明'] == 0].mean()
    if not medical_mean > no_medical_mean:
        raise ValueError('Medical certificate signal is not effective')

    night_mean = duration[df['是否夜班岗位'] == 1].mean()
    day_mean = duration[df['是否夜班岗位'] == 0].mean()
    if not night_mean > day_mean:
        raise ValueError('Night shift signal is not effective')


def generate_dataset(output_path=None, sample_count=12000, random_state=None):
    """Build the synthetic absence dataset and optionally write it to CSV.

    Companies and employees are generated first, then ``sample_count`` events
    are sampled for uniformly chosen employees, a timeline is attached and
    the result is validated.

    Args:
        output_path: Destination CSV path. Falsy values skip writing. Missing
            parent directories are created; a bare filename is written to
            the current working directory.
        sample_count: Number of absence events to sample.
        random_state: Seed for company / employee / event sampling. ``None``
            falls back to ``config.RANDOM_STATE``.

    Returns:
        The validated event DataFrame.

    Raises:
        ValueError: Propagated from validate_dataset. Called without extra
            arguments it requires at least 10000 rows and 2000 distinct
            employees, so a small ``sample_count`` fails.

    NOTE: attach_event_timeline is called without a seed and therefore draws
    event dates from ``config.RANDOM_STATE`` regardless of ``random_state``.
    """
    seed = config.RANDOM_STATE if random_state is None else random_state
    rng = np.random.default_rng(seed)
    companies = build_company_pool(rng)
    employees = build_employee_pool(rng, companies)

    # Choose all target employees up front, then sample one event for each.
    employee_idx = rng.integers(0, len(employees), size=sample_count)
    events = [sample_event(rng, employees[int(idx)]) for idx in employee_idx]

    df = attach_event_timeline(pd.DataFrame(events))
    validate_dataset(df)

    if output_path:
        # os.path.dirname returns '' for a bare filename and os.makedirs('')
        # raises, so only create the directory when there is one.
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        # utf-8-sig writes a BOM at the start of the file.
        df.to_csv(output_path, index=False, encoding='utf-8-sig')
    return df


def ensure_dataset():
    """Make sure a valid dataset exists at ``config.RAW_DATA_PATH``.

    If the file is missing it is generated. If it exists it is loaded and
    validated; when loading or validation fails for any reason the failure is
    logged and the file is regenerated.

    Raises:
        Whatever generate_dataset raises if (re)generation itself fails.
    """
    import logging

    if not os.path.exists(config.RAW_DATA_PATH):
        generate_dataset(config.RAW_DATA_PATH)
        return

    try:
        df = pd.read_csv(config.RAW_DATA_PATH)
        validate_dataset(df)
    except Exception as exc:
        # Deliberately broad: any unreadable or invalid file means "rebuild".
        # The reason is logged so a silently replaced file can be diagnosed.
        logging.getLogger(__name__).warning(
            'Existing dataset at %s is unusable (%s: %s); regenerating',
            config.RAW_DATA_PATH,
            type(exc).__name__,
            exc,
        )
    else:
        return

    # Regenerate outside the except block so a failure here is reported on
    # its own rather than chained to the validation error.
    generate_dataset(config.RAW_DATA_PATH)


if __name__ == '__main__':
    # Script entry point: regenerate the raw dataset and show a short preview.
    target_path = config.RAW_DATA_PATH
    preview = generate_dataset(target_path).head()
    print(f'Generated dataset: {target_path}')
    print(preview)