feat: 升级深度学习模型为 Temporal Fusion Transformer 架构

- 将 LSTMMLPRegressor 重构为 TemporalFusionRegressor，采用 Transformer Encoder 替代 LSTM
- 新增 LearnedAttentionPooling 和 GatedResidualBlock 模块，增强模型表达能力
- 优化训练策略，使用 OneCycleLR 调度器和样本加权机制
- 改进缺勤事件采样算法，基于压力、健康、家庭等维度更精确地计算缺勤时长
- 更新 .gitignore 排除原始数据文件，删除不再使用的原始 CSV 文件
2026-03-20 16:30:08 +08:00
parent ff0fbf96f7
commit 77e38fd15b
6 changed files with 225 additions and 12835 deletions
--- a/backend/core/generate_dataset.py
+++ b/backend/core/generate_dataset.py
@@ -161,68 +161,109 @@ def sample_event(rng, employee):
    weekday = int(rng.integers(1, 8))
    near_holiday = int(rng.random() < (0.3 if month in [1, 2, 4, 5, 9, 10] else 0.16))
    leave_type_items = ['病假', '事假', '年假', '调休', '婚假', '丧假', '产检育儿假', '工伤假', '其他']
-    leave_type = weighted_choice(rng, leave_type_items, [0.3, 0.22, 0.12, 0.14, 0.03, 0.02, 0.06, 0.02, 0.09])
-    if employee['子女数量'] > 0 and rng.random() < 0.14:
-        reason_category = '子女照护'
+    leave_probs = [0.26, 0.22, 0.11, 0.14, 0.03, 0.02, 0.07, 0.03, 0.12]
+    if employee['是否慢性病史'] == 1 or employee['年度体检异常标记'] == 1:
+        leave_probs = [0.34, 0.18, 0.08, 0.1, 0.02, 0.02, 0.08, 0.04, 0.14]
+    elif employee['子女数量'] >= 2:
+        leave_probs = [0.22, 0.24, 0.1, 0.12, 0.03, 0.02, 0.12, 0.02, 0.13]
+    leave_type = weighted_choice(rng, leave_type_items, leave_probs)
+
+    if leave_type in ['病假', '工伤假']:
+        reason_category = weighted_choice(rng, ['身体不适', '就医复查', '职业疲劳'], [0.52, 0.3, 0.18])
+    elif leave_type == '产检育儿假':
+        reason_category = weighted_choice(rng, ['子女照护', '家庭事务', '身体不适'], [0.6, 0.25, 0.15])
+    elif leave_type in ['婚假', '丧假']:
+        reason_category = weighted_choice(rng, ['家庭事务', '突发事件'], [0.72, 0.28])
+    elif leave_type in ['年假', '调休']:
+        reason_category = weighted_choice(rng, ['职业疲劳', '家庭事务', '交通受阻'], [0.52, 0.28, 0.2])
    else:
        reason_category = weighted_choice(
            rng,
-            ['身体不适', '家庭事务', '交通受阻', '突发事件', '职业疲劳', '就医复查'],
-            [0.28, 0.19, 0.09, 0.11, 0.2, 0.13],
+            ['身体不适', '家庭事务', '子女照护', '交通受阻', '突发事件', '职业疲劳'],
+            [0.2, 0.22, 0.14, 0.12, 0.12, 0.2],
        )
-    medical_certificate = int(leave_type in ['病假', '工伤假'] or reason_category in ['身体不适', '就医复查'])
-    urgent_leave = int(rng.random() < (0.45 if leave_type in ['病假', '事假', '工伤假'] else 0.18))
-    continuous_absence = int(rng.random() < (0.2 if leave_type in ['病假', '产检育儿假', '工伤假'] else 0.08))
-    previous_overtime = int(rng.random() < min(0.85, employee['月均加班时长'] / 65))
+
+    medical_certificate = int(
+        leave_type in ['病假', '工伤假']
+        or reason_category in ['身体不适', '就医复查']
+        or (employee['是否慢性病史'] == 1 and leave_type == '其他')
+    )
+    urgent_leave = int(
+        leave_type in ['病假', '工伤假']
+        or reason_category in ['突发事件', '身体不适']
+        or (near_holiday == 0 and leave_type == '事假' and rng.random() < 0.35)
+    )
+    continuous_absence = int(
+        leave_type in ['病假', '工伤假', '产检育儿假']
+        and (employee['近90天缺勤次数'] >= 2 or employee['近180天请假总时长'] >= 28)
+    )
+    previous_overtime = int(
+        employee['月均加班时长'] >= 30
+        or (employee['月均加班时长'] >= 24 and weekday in [1, 2, 5])
+        or (employee['是否夜班岗位'] == 1 and rng.random() < 0.65)
+    )
    season = season_from_month(month)
    channel = weighted_choice(rng, ['系统申请', '主管代提', '临时电话报备'], [0.68, 0.18, 0.14])

-    base = 0.95
-    base += min(employee['月均加班时长'] / 28, 1.8)
-    base += min(employee['通勤时长分钟'] / 65, 1.2)
-    base += employee['是否夜班岗位'] * 0.9
-    base += employee['是否慢性病史'] * 1.25
-    base += employee['年度体检异常标记'] * 0.6
-    base += 0.35 * employee['子女数量']
-    base += 0.5 if employee['心理压力等级'] == '高' else (0.2 if employee['心理压力等级'] == '中' else -0.1)
-    base += 0.4 if employee['是否跨城通勤'] else 0
-    base += 0.35 if previous_overtime else 0
-    base += 0.35 if near_holiday else 0
-    base += 0.3 if continuous_absence else 0
-    base += 0.3 if employee['近90天缺勤次数'] >= 3 else 0
-    base -= 0.35 if employee['绩效等级'] == 'A' else (0.15 if employee['绩效等级'] == 'B' else 0)
-    base -= min(employee['司龄年数'] / 40, 0.5)
-    base -= min(employee['每周运动频次'] * 0.08, 0.3)
-    base -= 0.2 if employee['近30天睡眠时长均值'] >= 7.5 else 0
+    pressure_score = (
+        employee['月均加班时长'] * 0.032
+        + employee['通勤时长分钟'] * 0.018
+        + employee['是否夜班岗位'] * 0.75
+        + employee['是否跨城通勤'] * 0.32
+        + previous_overtime * 0.35
+    )
+    health_score = (
+        employee['是否慢性病史'] * 1.2
+        + employee['年度体检异常标记'] * 0.55
+        + (employee['BMI'] >= 28) * 0.3
+        + (employee['近30天睡眠时长均值'] < 6.4) * 0.45
+    )
+    family_score = employee['子女数量'] * 0.18 + employee['是否独生子女家庭负担'] * 0.28
+    resilience_score = (
+        (0.55 if employee['绩效等级'] == 'A' else 0.25 if employee['绩效等级'] == 'B' else 0.0)
+        + min(employee['司龄年数'] / 26, 0.65)
+        + min(employee['每周运动频次'] * 0.06, 0.25)
+    )
+
+    base = 0.35
+    base += pressure_score
+    base += health_score
+    base += family_score
+    base += 0.4 if employee['心理压力等级'] == '高' else (0.18 if employee['心理压力等级'] == '中' else -0.05)
+    base += 0.18 if near_holiday else 0.0
+    base += 0.35 if continuous_absence else 0.0
+    base += 0.28 if employee['近90天缺勤次数'] >= 3 else 0.0
+    base += 0.18 if employee['近180天请假总时长'] >= 36 else 0.0
+    base -= resilience_score

    leave_bonus = {
-        '病假': 2.0,
+        '病假': 2.1,
        '事假': 0.8,
-        '年假': 0.1,
+        '年假': 0.15,
        '调休': 0.1,
-        '婚假': 3.0,
+        '婚假': 3.1,
        '丧假': 2.8,
-        '产检育儿假': 2.4,
-        '工伤假': 3.8,
-        '其他': 0.5,
+        '产检育儿假': 2.35,
+        '工伤假': 3.9,
+        '其他': 0.55,
    }
    reason_bonus = {
        '身体不适': 1.0,
-        '家庭事务': 0.5,
-        '子女照护': 0.8,
+        '家庭事务': 0.55,
+        '子女照护': 0.75,
        '交通受阻': 0.2,
        '突发事件': 0.6,
        '职业疲劳': 0.7,
-        '就医复查': 1.2,
+        '就医复查': 1.15,
    }
    industry_bonus = {
-        '制造业': 0.35,
-        '互联网': 0.2,
-        '零售连锁': 0.25,
-        '物流运输': 0.4,
-        '金融服务': 0.1,
-        '医药健康': 0.2,
-        '建筑工程': 0.35,
+        '制造业': 0.42,
+        '互联网': 0.22,
+        '零售连锁': 0.28,
+        '物流运输': 0.5,
+        '金融服务': 0.12,
+        '医药健康': 0.24,
+        '建筑工程': 0.4,
    }
    season_bonus = {1: 0.35, 2: 0.0, 3: 0.15, 4: 0.05}
    weekday_bonus = {1: 0.05, 2: 0.0, 3: 0.0, 4: 0.05, 5: 0.15, 6: 0.25, 7: 0.3}
@@ -233,18 +274,21 @@ def sample_event(rng, employee):
    duration += industry_bonus[employee['所属行业']]
    duration += season_bonus[season]
    duration += weekday_bonus[weekday]
-    duration += 0.55 if medical_certificate else 0
-    duration += 0.4 if urgent_leave else -0.05
-    duration += rng.normal(0, 0.9)
+    duration += 0.55 if medical_certificate else 0.0
+    duration += 0.28 if urgent_leave else -0.06

-    if leave_type in ['婚假', '丧假', '工伤假'] and rng.random() < 0.5:
-        duration += rng.uniform(1.5, 5)
-    if leave_type == '病假' and employee['是否慢性病史'] == 1 and rng.random() < 0.35:
-        duration += rng.uniform(1, 4)
+    if leave_type == '病假' and employee['是否慢性病史'] == 1:
+        duration += 0.85
+    if leave_type == '工伤假':
+        duration += 1.0 + employee['是否夜班岗位'] * 0.3
+    if leave_type in ['婚假', '丧假']:
+        duration += 0.7 + 0.18 * near_holiday
+    if leave_type == '产检育儿假':
+        duration += 0.55 + employee['子女数量'] * 0.12
    if leave_type in ['年假', '调休']:
-        duration *= rng.uniform(0.7, 0.95)
+        duration *= 0.82 if near_holiday == 0 else 0.9

-    duration = round(float(np.clip(duration, 0.5, 24.0)), 1)
+    duration = round(float(np.clip(duration + rng.normal(0, 0.35), 0.5, 18.0)), 1)

    event = employee.copy()
    event.update({