feat: 将数据集从国外员工缺勤数据替换为中国企业缺勤模拟数据
- 新增中国企业员工缺勤模拟数据集生成脚本(generate_dataset.py),覆盖 7 个行业、180 家企业、2600 名员工
- 重构 config.py,更新特征字段为中文名称,调整目标列、员工 ID、行业类型等配置
- 重构 clustering.py,简化聚类逻辑,更新聚类特征和群体命名(高压通勤型、健康波动型等)
- 重构 feature_mining.py,更新相关性分析和群体比较维度(按行业、班次、婚姻状态等)
- 新增 model_features.py,定义模型训练特征
- 更新 preprocessing.py 和 train_model.py,适配新数据结构
- 更新各 API 路由默认参数(model: random_forest, dimension: industry)
- 前端更新主题样式和各视图组件,适配中文字段
- 更新系统名称为 China Enterprise Absence Analysis System
This commit is contained in:
336
backend/core/generate_dataset.py
Normal file
336
backend/core/generate_dataset.py
Normal file
@@ -0,0 +1,336 @@
|
||||
import os
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
import config
|
||||
|
||||
|
||||
# Per-industry bias factors used when sampling employee attributes.
# Each row is (industry name, shift_bias, overtime_bias, night_bias); the row
# order defines the dict's key order, which build_company_pool pairs
# positionally with its industry weights.
INDUSTRIES = {
    name: {
        'shift_bias': shift_bias,
        'overtime_bias': overtime_bias,
        'night_bias': night_bias,
    }
    for name, shift_bias, overtime_bias, night_bias in (
        ('制造业', 0.9, 0.8, 0.8),
        ('互联网', 0.2, 1.0, 0.2),
        ('零售连锁', 0.7, 0.5, 0.3),
        ('物流运输', 0.9, 0.7, 0.9),
        ('金融服务', 0.1, 0.7, 0.1),
        ('医药健康', 0.6, 0.6, 0.5),
        ('建筑工程', 0.5, 0.8, 0.3),
    )
}
|
||||
|
||||
|
||||
def season_from_month(month):
    """Map a calendar month number to a season code.

    Returns 1 for months 12/1/2, 2 for months 3-5, 3 for months 6-8 and
    4 for every other value (including anything outside 1-12).
    """
    season_table = (
        (1, (12, 1, 2)),
        (2, (3, 4, 5)),
        (3, (6, 7, 8)),
    )
    for season_code, months in season_table:
        if month in months:
            return season_code
    # Everything not matched above falls through to season 4.
    return 4
|
||||
|
||||
|
||||
def weighted_choice(rng, items, probs):
    """Draw one element of ``items`` using relative weights ``probs``.

    The weights are converted to floats and rescaled to sum to 1 before being
    handed to ``rng.choice``, so callers may pass unnormalised weights.
    """
    weights = np.asarray(probs, dtype=float)
    return rng.choice(items, p=weights / weights.sum())
|
||||
|
||||
|
||||
def build_company_pool(rng, company_count=180):
    """Create ``company_count`` synthetic company records.

    Each record is a dict with the keys 企业编号 (``C001``, ``C002``, ...),
    所属行业, 企业规模 and 所在城市等级. For every company the industry is
    drawn first, then the scale, then the city tier, all through
    ``weighted_choice`` on the supplied ``rng``.
    """
    industry_names = list(INDUSTRIES)
    # Positional weights: one entry per INDUSTRIES key, in key order.
    industry_weights = [0.22, 0.14, 0.14, 0.14, 0.1, 0.12, 0.14]
    scale_labels = ['100人以下', '100-499人', '500-999人', '1000-4999人', '5000人及以上']
    scale_weights = [0.15, 0.28, 0.2, 0.24, 0.13]
    tier_labels = ['一线', '新一线', '二线', '三线及以下']
    tier_weights = [0.18, 0.34, 0.3, 0.18]

    pool = []
    for number in range(1, company_count + 1):
        industry = weighted_choice(rng, industry_names, industry_weights)
        scale = weighted_choice(rng, scale_labels, scale_weights)
        city_tier = weighted_choice(rng, tier_labels, tier_weights)
        pool.append({
            '企业编号': f'C{number:03d}',
            '所属行业': industry,
            '企业规模': scale,
            '所在城市等级': city_tier,
        })
    return pool
|
||||
|
||||
|
||||
def build_employee_pool(rng, companies, employee_count=2600):
    """Create ``employee_count`` synthetic employee profiles.

    Every employee is attached to a uniformly chosen entry of ``companies``
    and inherits that company's 企业编号 / 所属行业 / 企业规模 / 所在城市等级.
    The remaining attributes are sampled from ``rng``; several of them depend
    on earlier draws (age, shift pattern, overtime, chronic illness, ...).

    The order of the random draws below is significant: changing it changes
    the dataset produced for a given seed.

    Returns a list of dicts, one per employee, keyed by the Chinese column
    names used in the output dataset.
    """
    gender_options = ['男', '女']
    employment_options = ['正式员工', '劳务派遣', '外包驻场', '实习生']
    department_options = ['生产', '研发', '销售', '客服', '职能', '仓储物流', '门店运营']
    job_family_options = ['管理', '专业技术', '销售业务', '生产操作', '行政支持', '客服坐席']
    job_level_options = ['初级', '中级', '高级', '主管', '经理及以上']
    education_options = ['中专及以下', '大专', '本科', '硕士', '博士']
    marital_options = ['未婚', '已婚', '离异/其他']
    housing_options = ['自有住房', '租房', '宿舍']
    shift_options = ['标准白班', '两班倒', '三班倒', '弹性班']
    performance_options = ['A', 'B', 'C', 'D']
    stress_options = ['低', '中', '高']

    pool = []
    for number in range(1, employee_count + 1):
        company = companies[rng.integers(0, len(companies))]
        industry = company['所属行业']
        profile = INDUSTRIES[industry]

        # --- demographics -------------------------------------------------
        age = int(np.clip(rng.normal(33, 7), 20, 55))
        tenure = round(float(np.clip(age - 21 + rng.normal(0, 2), 0.2, 32)), 1)
        if age < 30:
            family_bias = 0.25
            marital_weights = [0.45, 0.48, 0.07]
        else:
            family_bias = 0.6
            marital_weights = [0.18, 0.72, 0.1]
        married = weighted_choice(rng, marital_options, marital_weights)
        # NOTE(review): unmarried employees get a child rate of 0.4, which is
        # higher than the 0.25 used for married employees under 30 - confirm
        # this is intended.
        child_rate = 0.4 if married == '未婚' else family_bias
        children = int(np.clip(rng.poisson(child_rate), 0, 3))

        # --- working pattern ----------------------------------------------
        shift_bias = profile['shift_bias']
        shift = weighted_choice(
            rng,
            shift_options,
            [max(0.1, 1 - shift_bias), 0.35 * shift_bias, 0.25 * shift_bias, 0.2],
        )
        # Three-shift rotations are always night posts; two-shift rotations
        # are night posts with the industry's night_bias probability. A random
        # number is consumed only in the two-shift case.
        if shift == '三班倒':
            night_flag = 1
        elif shift == '两班倒':
            night_flag = int(rng.random() < profile['night_bias'])
        else:
            night_flag = 0
        overtime = float(np.clip(rng.normal(22 + 18 * profile['overtime_bias'], 10), 0, 90))
        commute_minutes = float(np.clip(rng.normal(42, 18), 8, 130))
        commute_km = float(np.clip(commute_minutes * rng.uniform(0.35, 0.75), 2, 65))
        performance_level = weighted_choice(rng, performance_options, [0.18, 0.46, 0.26, 0.1])

        # --- health and lifestyle -----------------------------------------
        chronic_flag = int(rng.random() < max(0.05, (age - 26) * 0.01))
        # A chronic condition always marks the check-up abnormal; otherwise
        # one more random number decides.
        if chronic_flag == 1:
            check_abnormal = 1
        else:
            check_abnormal = int(rng.random() < 0.14)
        sleep_hours = round(float(np.clip(rng.normal(6.9 - 0.35 * night_flag, 0.8), 4.5, 9.0)), 1)
        exercise = int(np.clip(rng.poisson(2.2), 0, 7))
        # Two draws, in this order: the value compared, then the one that
        # selects the threshold (0.22 with probability 0.55, else 0.08).
        smoke_draw = rng.random()
        smoke_threshold = 0.22 if rng.random() < 0.55 else 0.08
        smoking = int(smoke_draw < smoke_threshold)
        drinking = int(rng.random() < 0.27)
        stress_level = weighted_choice(
            rng,
            stress_options,
            [0.22, 0.52, 0.26 + min(0.15, overtime / 120)],
        )
        bmi = round(float(np.clip(rng.normal(24.2, 3.2), 17.5, 36.5)), 1)

        # --- absence history and team context -----------------------------
        history_rate = 1.2 + chronic_flag * 0.6 + children * 0.15
        history_count = int(np.clip(rng.poisson(history_rate), 0, 8))
        history_hours = float(
            np.clip(rng.normal(18 + chronic_flag * 10 + history_count * 3, 10), 0, 120)
        )
        discipline = int(np.clip(rng.poisson(0.2), 0, 4))
        team_size = int(np.clip(rng.normal(11, 5), 3, 40))
        manager_span = int(np.clip(team_size + rng.normal(3, 2), 4, 60))
        local_hukou = int(rng.random() < 0.58)
        # Long commutes always count as cross-city; otherwise only non-local
        # employees consume a random number for the decision.
        if commute_minutes > 65:
            cross_city = 1
        elif local_hukou == 0:
            cross_city = int(rng.random() < 0.35)
        else:
            cross_city = 0
        if industry in ['互联网', '金融服务']:
            sedentary = int(weighted_choice(rng, [0, 1], [0.45, 0.55]))
        else:
            sedentary = int(rng.random() < 0.3)

        # --- remaining categorical draws (kept in output-column order) ----
        employment_type = weighted_choice(rng, employment_options, [0.74, 0.12, 0.1, 0.04])
        department = weighted_choice(
            rng, department_options, [0.18, 0.16, 0.14, 0.11, 0.12, 0.14, 0.15]
        )
        job_family = weighted_choice(rng, job_family_options, [0.08, 0.24, 0.16, 0.2, 0.12, 0.2])
        job_level = weighted_choice(rng, job_level_options, [0.34, 0.32, 0.18, 0.11, 0.05])
        gender = weighted_choice(rng, gender_options, [0.56, 0.44])
        education = weighted_choice(rng, education_options, [0.14, 0.28, 0.4, 0.15, 0.03])
        if children >= 2:
            family_burden = 1
        elif married == '已婚':
            family_burden = int(rng.random() < 0.18)
        else:
            family_burden = 0
        housing_type = weighted_choice(rng, housing_options, [0.38, 0.48, 0.14])
        attendance_days = int(np.clip(rng.normal(21.5, 2.2), 14, 27))

        pool.append({
            '企业编号': company['企业编号'],
            '所属行业': industry,
            '企业规模': company['企业规模'],
            '所在城市等级': company['所在城市等级'],
            '用工类型': employment_type,
            '部门条线': department,
            '岗位序列': job_family,
            '岗位级别': job_level,
            '员工编号': f'E{number:05d}',
            '性别': gender,
            '年龄': age,
            '司龄年数': tenure,
            '最高学历': education,
            '婚姻状态': married,
            '是否本地户籍': local_hukou,
            '子女数量': children,
            '是否独生子女家庭负担': family_burden,
            '居住类型': housing_type,
            '班次类型': shift,
            '是否夜班岗位': night_flag,
            '月均加班时长': round(overtime, 1),
            '近30天出勤天数': attendance_days,
            '近90天缺勤次数': history_count,
            '近180天请假总时长': round(history_hours, 1),
            '通勤时长分钟': round(commute_minutes, 1),
            '通勤距离公里': round(commute_km, 1),
            '是否跨城通勤': cross_city,
            '绩效等级': performance_level,
            '近12月违纪次数': discipline,
            '团队人数': team_size,
            '直属上级管理跨度': manager_span,
            'BMI': bmi,
            '是否慢性病史': chronic_flag,
            '年度体检异常标记': check_abnormal,
            '近30天睡眠时长均值': sleep_hours,
            '每周运动频次': exercise,
            '是否吸烟': smoking,
            '是否饮酒': drinking,
            '心理压力等级': stress_level,
            '是否长期久坐岗位': sedentary,
        })
    return pool
|
||||
|
||||
|
||||
# Additive contributions (in hours) to the simulated absence duration,
# looked up by sample_event.
_LEAVE_TYPE_BONUS = {
    '病假': 2.0,
    '事假': 0.8,
    '年假': 0.1,
    '调休': 0.1,
    '婚假': 3.0,
    '丧假': 2.8,
    '产检育儿假': 2.4,
    '工伤假': 3.8,
    '其他': 0.5,
}
_REASON_BONUS = {
    '身体不适': 1.0,
    '家庭事务': 0.5,
    '子女照护': 0.8,
    '交通受阻': 0.2,
    '突发事件': 0.6,
    '职业疲劳': 0.7,
    '就医复查': 1.2,
}
_INDUSTRY_BONUS = {
    '制造业': 0.35,
    '互联网': 0.2,
    '零售连锁': 0.25,
    '物流运输': 0.4,
    '金融服务': 0.1,
    '医药健康': 0.2,
    '建筑工程': 0.35,
}
_SEASON_BONUS = {1: 0.35, 2: 0.0, 3: 0.15, 4: 0.05}
_WEEKDAY_BONUS = {1: 0.05, 2: 0.0, 3: 0.0, 4: 0.05, 5: 0.15, 6: 0.25, 7: 0.3}


def sample_event(rng, employee):
    """Simulate one absence event for ``employee``.

    Event attributes (month, weekday, leave type, reason, flags, channel) are
    sampled from ``rng``. The absence duration is an additive score built from
    the employee profile and the event attributes, plus Gaussian noise and a
    few leave-type specific adjustments, clipped to [0.5, 24.0] hours and
    rounded to one decimal.

    Returns a new dict holding every key of ``employee`` followed by the event
    columns; ``employee`` itself is not modified.
    """
    month = int(rng.integers(1, 13))
    weekday = int(rng.integers(1, 8))
    holiday_rate = 0.3 if month in (1, 2, 4, 5, 9, 10) else 0.16
    near_holiday = int(rng.random() < holiday_rate)

    leave_type = weighted_choice(
        rng,
        ['病假', '事假', '年假', '调休', '婚假', '丧假', '产检育儿假', '工伤假', '其他'],
        [0.3, 0.22, 0.12, 0.14, 0.03, 0.02, 0.06, 0.02, 0.09],
    )

    # Employees with children may be assigned the child-care reason; the
    # random draw for that happens only when they have children.
    if employee['子女数量'] > 0 and rng.random() < 0.14:
        reason_category = '子女照护'
    else:
        reason_category = weighted_choice(
            rng,
            ['身体不适', '家庭事务', '交通受阻', '突发事件', '职业疲劳', '就医复查'],
            [0.28, 0.19, 0.09, 0.11, 0.2, 0.13],
        )

    medical_certificate = int(
        leave_type in ('病假', '工伤假') or reason_category in ('身体不适', '就医复查')
    )
    urgent_rate = 0.45 if leave_type in ('病假', '事假', '工伤假') else 0.18
    urgent_leave = int(rng.random() < urgent_rate)
    continuous_rate = 0.2 if leave_type in ('病假', '产检育儿假', '工伤假') else 0.08
    continuous_absence = int(rng.random() < continuous_rate)
    previous_overtime = int(rng.random() < min(0.85, employee['月均加班时长'] / 65))
    season = season_from_month(month)
    channel = weighted_choice(rng, ['系统申请', '主管代提', '临时电话报备'], [0.68, 0.18, 0.14])

    # Employee- and context-driven baseline. The terms are accumulated in a
    # fixed order so floating-point results stay reproducible.
    stress = employee['心理压力等级']
    grade = employee['绩效等级']

    base = 0.95
    base += min(employee['月均加班时长'] / 28, 1.8)
    base += min(employee['通勤时长分钟'] / 65, 1.2)
    base += employee['是否夜班岗位'] * 0.9
    base += employee['是否慢性病史'] * 1.25
    base += employee['年度体检异常标记'] * 0.6
    base += 0.35 * employee['子女数量']
    if stress == '高':
        base += 0.5
    elif stress == '中':
        base += 0.2
    else:
        base -= 0.1
    if employee['是否跨城通勤']:
        base += 0.4
    if previous_overtime:
        base += 0.35
    if near_holiday:
        base += 0.35
    if continuous_absence:
        base += 0.3
    if employee['近90天缺勤次数'] >= 3:
        base += 0.3
    if grade == 'A':
        base -= 0.35
    elif grade == 'B':
        base -= 0.15
    base -= min(employee['司龄年数'] / 40, 0.5)
    base -= min(employee['每周运动频次'] * 0.08, 0.3)
    if employee['近30天睡眠时长均值'] >= 7.5:
        base -= 0.2

    duration = base
    duration += _LEAVE_TYPE_BONUS[leave_type]
    duration += _REASON_BONUS[reason_category]
    duration += _INDUSTRY_BONUS[employee['所属行业']]
    duration += _SEASON_BONUS[season]
    duration += _WEEKDAY_BONUS[weekday]
    if medical_certificate:
        duration += 0.55
    duration += 0.4 if urgent_leave else -0.05
    duration += rng.normal(0, 0.9)

    # Leave-type specific adjustments. Each random draw is consumed only when
    # the preceding conditions hold.
    if leave_type in ('婚假', '丧假', '工伤假') and rng.random() < 0.5:
        duration += rng.uniform(1.5, 5)
    if leave_type == '病假' and employee['是否慢性病史'] == 1 and rng.random() < 0.35:
        duration += rng.uniform(1, 4)
    if leave_type in ('年假', '调休'):
        duration *= rng.uniform(0.7, 0.95)

    duration = round(float(np.clip(duration, 0.5, 24.0)), 1)

    return {
        **employee,
        '缺勤月份': month,
        '星期几': weekday,
        '是否节假日前后': near_holiday,
        '季节': season,
        '请假申请渠道': channel,
        '请假类型': leave_type,
        '请假原因大类': reason_category,
        '是否提供医院证明': medical_certificate,
        '是否临时请假': urgent_leave,
        '是否连续缺勤': continuous_absence,
        '前一工作日是否加班': previous_overtime,
        '缺勤时长(小时)': duration,
    }
|
||||
|
||||
|
||||
def validate_dataset(df):
    """Check that ``df`` looks like a usable synthetic absence dataset.

    The checks, in order:

    1. every required column is present;
    2. at least 10000 rows and at least 2000 distinct 员工编号 values;
    3. the share of events longer than 8 hours lies in [0.15, 0.4];
    4. events with a medical certificate are longer on average than those
       without;
    5. events of night-shift employees are longer on average than the rest.

    Returns None when every check passes.

    Raises:
        ValueError: if any check fails. This includes the case where one of
            the groups compared in checks 4 and 5 is empty, because the
            signal cannot be confirmed then.
    """
    required_columns = [
        '员工编号',
        '所属行业',
        '岗位序列',
        '月均加班时长',
        '通勤时长分钟',
        '是否慢性病史',
        '请假类型',
        '缺勤时长(小时)',
        # Read by the signal checks below; listing them here makes a file
        # without them fail with ValueError instead of a KeyError.
        '是否提供医院证明',
        '是否夜班岗位',
    ]
    for column in required_columns:
        if column not in df.columns:
            raise ValueError(f'Missing required column: {column}')

    if len(df) < 10000:
        raise ValueError('Synthetic dataset is smaller than expected')
    if df['员工编号'].nunique() < 2000:
        raise ValueError('Employee coverage is too small')

    hours = df['缺勤时长(小时)']

    high_risk_ratio = (hours > 8).mean()
    if not 0.15 <= high_risk_ratio <= 0.4:
        raise ValueError(f'High risk ratio out of range: {high_risk_ratio:.3f}')

    # The comparisons are written as ``not a > b`` on purpose: the mean of an
    # empty group is NaN, and ``a <= b`` would be False for NaN and let the
    # check pass silently.
    medical_mean = hours[df['是否提供医院证明'] == 1].mean()
    no_medical_mean = hours[df['是否提供医院证明'] == 0].mean()
    if not medical_mean > no_medical_mean:
        raise ValueError('Medical certificate signal is not effective')

    night_mean = hours[df['是否夜班岗位'] == 1].mean()
    day_mean = hours[df['是否夜班岗位'] == 0].mean()
    if not night_mean > day_mean:
        raise ValueError('Night shift signal is not effective')
|
||||
|
||||
|
||||
def generate_dataset(output_path=None, sample_count=12000, random_state=None):
    """Generate the synthetic absence dataset and optionally write it as CSV.

    Args:
        output_path: Destination CSV path. When falsy, nothing is written.
            Missing parent directories are created; a bare filename (no
            directory part) is written to the current working directory.
        sample_count: Number of absence events to sample.
        random_state: Seed for the random generator; ``config.RANDOM_STATE``
            is used when this is None.

    Returns:
        A DataFrame with one row per sampled absence event.

    Raises:
        ValueError: propagated from ``validate_dataset`` when the generated
            data fails its sanity checks.
    """
    seed = config.RANDOM_STATE if random_state is None else random_state
    rng = np.random.default_rng(seed)
    companies = build_company_pool(rng)
    employees = build_employee_pool(rng, companies)

    # All employee indices are drawn up front, before any event is sampled,
    # so the random stream is consumed in a fixed order.
    employee_idx = rng.integers(0, len(employees), size=sample_count)
    events = [sample_event(rng, employees[int(idx)]) for idx in employee_idx]

    df = pd.DataFrame(events)
    validate_dataset(df)

    if output_path:
        # os.path.dirname returns '' for a bare filename, and
        # os.makedirs('') raises FileNotFoundError, so skip it in that case.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        df.to_csv(output_path, index=False, encoding='utf-8-sig')
    return df
|
||||
|
||||
|
||||
def ensure_dataset():
    """Make sure a valid dataset exists at ``config.RAW_DATA_PATH``.

    The dataset is (re)generated when the file is missing, or when reading or
    validating the existing file raises any exception. A file that loads and
    validates cleanly is left untouched.
    """
    needs_rebuild = not os.path.exists(config.RAW_DATA_PATH)

    if not needs_rebuild:
        try:
            validate_dataset(pd.read_csv(config.RAW_DATA_PATH))
        except Exception:
            # Best effort: any unreadable or invalid file is simply replaced.
            needs_rebuild = True

    if needs_rebuild:
        generate_dataset(config.RAW_DATA_PATH)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: regenerate the dataset at the configured path and
    # print a short preview of it.
    target_path = config.RAW_DATA_PATH
    dataset = generate_dataset(target_path)
    print(f'Generated dataset: {target_path}')
    print(dataset.head())
|
||||
Reference in New Issue
Block a user