Files
forsetsystem/backend/core/generate_dataset.py
shuo e8235bf3ca feat: 添加 JD-R 理论分析模块与 SHAP 可解释性分析功能
- 后端新增 JD-R(工作要求-资源)理论维度数据生成,包含工作要求、工作资源、
    个人资源、中介变量共 16 个新特征列
  - 新增 JD-R 分析服务与 API(维度统计、倦怠投入分析、双路径中介分析、
    分组轮廓、风险分布)
  - 新增 SHAP 可解释性分析模块(全局重要性、局部解释、特征交互、依赖图)
  - 预测服务增加风险分类模型加载与概率预测能力
  - 前端新增 JD-R 分析页面(JDRAnalysis.vue),含雷达图、散点图、路径分析等可视化
  - 预测页面增加风险概率展示与 SHAP 特征解释
  - 路由与导航菜单同步更新
2026-04-04 07:15:46 +08:00

571 lines
23 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import sys
import numpy as np
import pandas as pd
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import config
# Per-industry bias weights read by build_employee_pool:
#   shift_bias    - weights the shift-type draw (higher favours the two-/three-shift options)
#   overtime_bias - raises the mean of the monthly-overtime draw
#   night_bias    - probability that a two-shift worker is flagged as a night-shift post
INDUSTRIES = {
    '制造业': {'shift_bias': 0.9, 'overtime_bias': 0.8, 'night_bias': 0.8},
    '互联网': {'shift_bias': 0.2, 'overtime_bias': 1.0, 'night_bias': 0.2},
    '零售连锁': {'shift_bias': 0.7, 'overtime_bias': 0.5, 'night_bias': 0.3},
    '物流运输': {'shift_bias': 0.9, 'overtime_bias': 0.7, 'night_bias': 0.9},
    '金融服务': {'shift_bias': 0.1, 'overtime_bias': 0.7, 'night_bias': 0.1},
    '医药健康': {'shift_bias': 0.6, 'overtime_bias': 0.6, 'night_bias': 0.5},
    '建筑工程': {'shift_bias': 0.5, 'overtime_bias': 0.8, 'night_bias': 0.3},
}
def season_from_month(month):
    """Map a calendar month to the season code used by the generator.

    Returns 1 for December-February, 2 for March-May, 3 for June-August and
    4 for any other value.
    """
    season_table = (
        (1, (12, 1, 2)),
        (2, (3, 4, 5)),
        (3, (6, 7, 8)),
    )
    for season_code, months in season_table:
        if month in months:
            return season_code
    # Everything not matched above falls through to code 4.
    return 4
def weighted_choice(rng, items, probs):
    """Draw one element of `items` using `probs` as unnormalised weights.

    The weights are converted to floats and rescaled to sum to 1 before being
    handed to ``rng.choice``; the draw consumes from `rng`.
    """
    weights = np.array(probs, dtype=float)
    normalised = weights / weights.sum()
    return rng.choice(items, p=normalised)
def build_company_pool(rng, company_count=180):
    """Create `company_count` synthetic company records.

    Each record is a dict with a sequential company id (``C001``, ``C002``, ...),
    an industry drawn from the keys of ``INDUSTRIES``, a company-size band and
    a city tier. All draws consume from `rng`.
    """
    industry_names = list(INDUSTRIES.keys())
    industry_weights = [0.22, 0.14, 0.14, 0.14, 0.1, 0.12, 0.14]
    scale_labels = ['100人以下', '100-499人', '500-999人', '1000-4999人', '5000人及以上']
    scale_weights = [0.15, 0.28, 0.2, 0.24, 0.13]
    tier_labels = ['一线', '新一线', '二线', '三线及以下']
    tier_weights = [0.18, 0.34, 0.3, 0.18]

    def make_company(position):
        # Draw order (industry, then scale, then city tier) fixes the random stream.
        sector = weighted_choice(rng, industry_names, industry_weights)
        scale = weighted_choice(rng, scale_labels, scale_weights)
        tier = weighted_choice(rng, tier_labels, tier_weights)
        return {
            '企业编号': f'C{position + 1:03d}',
            '所属行业': sector,
            '企业规模': scale,
            '所在城市等级': tier,
        }

    return [make_company(position) for position in range(company_count)]
def build_employee_pool(rng, companies, employee_count=2600):
    """Create `employee_count` synthetic employee records.

    Every employee is attached to a company picked uniformly from `companies`
    and inherits that company's id, industry, size band and city tier. The
    remaining attributes (demographics, shift pattern, commute, health and
    attendance history) are drawn from `rng`; several depend on earlier draws
    for the same employee, so the statement order determines the output.

    Args:
        rng: NumPy random generator; all draws consume from it.
        companies: list of company dicts as produced by build_company_pool.
        employee_count: number of employee records to create.

    Returns:
        A list of dicts, one per employee, keyed by the dataset column names.
    """
    # NOTE(review): both gender labels are empty strings as written, so the
    # '性别' column carries no information. Possibly characters were lost in
    # transit — confirm the intended labels before relying on this column.
    genders = ['', '']
    employment_types = ['正式员工', '劳务派遣', '外包驻场', '实习生']
    departments = ['生产', '研发', '销售', '客服', '职能', '仓储物流', '门店运营']
    job_families = ['管理', '专业技术', '销售业务', '生产操作', '行政支持', '客服坐席']
    job_levels = ['初级', '中级', '高级', '主管', '经理及以上']
    educations = ['中专及以下', '大专', '本科', '硕士', '博士']
    marital = ['未婚', '已婚', '离异/其他']
    housing = ['自有住房', '租房', '宿舍']
    shifts = ['标准白班', '两班倒', '三班倒', '弹性班']
    performance = ['A', 'B', 'C', 'D']
    # NOTE(review): all three stress labels are empty strings as written, so
    # every employee ends up with the same '心理压力等级' value. Confirm the
    # intended labels (sample_event compares this column against literals).
    stress = ['', '', '']
    employees = []
    for idx in range(employee_count):
        company = companies[rng.integers(0, len(companies))]
        industry = company['所属行业']
        age = int(np.clip(rng.normal(33, 7), 20, 55))
        # Tenure tracks age (age - 21 plus noise), bounded to [0.2, 32] years.
        tenure = round(float(np.clip(age - 21 + rng.normal(0, 2), 0.2, 32)), 1)
        family_bias = 0.6 if age >= 30 else 0.25
        # Marital-status weights differ for employees under 30 versus 30 and over.
        married = weighted_choice(rng, marital, [0.45, 0.48, 0.07] if age < 30 else [0.18, 0.72, 0.1])
        children = int(np.clip(rng.poisson(0.4 if married == '未婚' else family_bias), 0, 3))
        industry_profile = INDUSTRIES[industry]
        # Weights line up with `shifts`: day shift, two-shift, three-shift, flexible.
        shift = weighted_choice(
            rng,
            shifts,
            [
                max(0.1, 1 - industry_profile['shift_bias']),
                0.35 * industry_profile['shift_bias'],
                0.25 * industry_profile['shift_bias'],
                0.2,
            ],
        )
        # Three-shift posts are always night posts; two-shift posts only with
        # probability `night_bias` (the random draw happens only for two-shift).
        night_flag = int(shift == '三班倒' or (shift == '两班倒' and rng.random() < industry_profile['night_bias']))
        overtime = float(np.clip(rng.normal(22 + 18 * industry_profile['overtime_bias'], 10), 0, 90))
        commute_minutes = float(np.clip(rng.normal(42, 18), 8, 130))
        commute_km = float(np.clip(commute_minutes * rng.uniform(0.35, 0.75), 2, 65))
        performance_level = weighted_choice(rng, performance, [0.18, 0.46, 0.26, 0.1])
        # Chronic-illness probability grows by 0.01 per year of age above 26, floor 0.05.
        chronic_flag = int(rng.random() < max(0.05, (age - 26) * 0.01))
        check_abnormal = int(chronic_flag == 1 or rng.random() < 0.14)
        sleep_hours = round(float(np.clip(rng.normal(6.9 - 0.35 * night_flag, 0.8), 4.5, 9.0)), 1)
        exercise = int(np.clip(rng.poisson(2.2), 0, 7))
        # Two uniform draws: one picks the rate (0.22 or 0.08), the other is
        # compared against it.
        smoking = int(rng.random() < (0.22 if rng.random() < 0.55 else 0.08))
        drinking = int(rng.random() < 0.27)
        # The last weight grows with overtime, capped at +0.15.
        stress_level = weighted_choice(
            rng,
            stress,
            [0.22, 0.52, 0.26 + min(0.15, overtime / 120)],
        )
        bmi = round(float(np.clip(rng.normal(24.2, 3.2), 17.5, 36.5)), 1)
        # Stored below as '近90天缺勤次数' and '近180天请假总时长' respectively.
        history_count = int(np.clip(rng.poisson(1.2 + chronic_flag * 0.6 + children * 0.15), 0, 8))
        history_hours = float(np.clip(rng.normal(18 + chronic_flag * 10 + history_count * 3, 10), 0, 120))
        discipline = int(np.clip(rng.poisson(0.2), 0, 4))
        team_size = int(np.clip(rng.normal(11, 5), 3, 40))
        manager_span = int(np.clip(team_size + rng.normal(3, 2), 4, 60))
        local_hukou = int(rng.random() < 0.58)
        cross_city = int(commute_minutes > 65 or (local_hukou == 0 and rng.random() < 0.35))
        # Internet and financial-services employees use a 0.55 sedentary rate; others 0.3.
        sedentary = int(weighted_choice(rng, [0, 1], [0.45, 0.55]) if company['所属行业'] in ['互联网', '金融服务'] else rng.random() < 0.3)
        # The remaining weighted draws happen inside the dict literal, in key order.
        employees.append({
            '企业编号': company['企业编号'],
            '所属行业': industry,
            '企业规模': company['企业规模'],
            '所在城市等级': company['所在城市等级'],
            '用工类型': weighted_choice(rng, employment_types, [0.74, 0.12, 0.1, 0.04]),
            '部门条线': weighted_choice(rng, departments, [0.18, 0.16, 0.14, 0.11, 0.12, 0.14, 0.15]),
            '岗位序列': weighted_choice(rng, job_families, [0.08, 0.24, 0.16, 0.2, 0.12, 0.2]),
            '岗位级别': weighted_choice(rng, job_levels, [0.34, 0.32, 0.18, 0.11, 0.05]),
            '员工编号': f'E{idx + 1:05d}',
            '性别': weighted_choice(rng, genders, [0.56, 0.44]),
            '年龄': age,
            '司龄年数': tenure,
            '最高学历': weighted_choice(rng, educations, [0.14, 0.28, 0.4, 0.15, 0.03]),
            '婚姻状态': married,
            '是否本地户籍': local_hukou,
            '子女数量': children,
            '是否独生子女家庭负担': int(children >= 2 or (married == '已婚' and rng.random() < 0.18)),
            '居住类型': weighted_choice(rng, housing, [0.38, 0.48, 0.14]),
            '班次类型': shift,
            '是否夜班岗位': night_flag,
            '月均加班时长': round(overtime, 1),
            '近30天出勤天数': int(np.clip(rng.normal(21.5, 2.2), 14, 27)),
            '近90天缺勤次数': history_count,
            '近180天请假总时长': round(history_hours, 1),
            '通勤时长分钟': round(commute_minutes, 1),
            '通勤距离公里': round(commute_km, 1),
            '是否跨城通勤': cross_city,
            '绩效等级': performance_level,
            '近12月违纪次数': discipline,
            '团队人数': team_size,
            '直属上级管理跨度': manager_span,
            'BMI': bmi,
            '是否慢性病史': chronic_flag,
            '年度体检异常标记': check_abnormal,
            '近30天睡眠时长均值': sleep_hours,
            '每周运动频次': exercise,
            '是否吸烟': smoking,
            '是否饮酒': drinking,
            '心理压力等级': stress_level,
            '是否长期久坐岗位': sedentary,
        })
    return employees
def sample_event(rng, employee):
    """Draw one synthetic absence event for `employee`.

    Samples the event's month, weekday, leave type, reason and several binary
    flags, then computes the absence duration as an additive score built from
    employee attributes and event attributes plus Gaussian noise.

    Args:
        rng: NumPy random generator; all draws consume from it.
        employee: employee dict as produced by build_employee_pool.

    Returns:
        A new dict: a copy of `employee` extended with the event columns,
        including the duration under '缺勤时长(小时)' (clipped to [0.5, 18.0]
        and rounded to one decimal). `employee` itself is not modified.
    """
    month = int(rng.integers(1, 13))
    weekday = int(rng.integers(1, 8))
    # Months 1, 2, 4, 5, 9 and 10 use a higher near-holiday probability.
    near_holiday = int(rng.random() < (0.3 if month in [1, 2, 4, 5, 9, 10] else 0.16))
    leave_type_items = ['病假', '事假', '年假', '调休', '婚假', '丧假', '产检育儿假', '工伤假', '其他']
    leave_probs = [0.26, 0.22, 0.11, 0.14, 0.03, 0.02, 0.07, 0.03, 0.12]
    # Health issues shift weight towards sick leave; two or more children shift
    # it towards personal and childcare leave.
    if employee['是否慢性病史'] == 1 or employee['年度体检异常标记'] == 1:
        leave_probs = [0.34, 0.18, 0.08, 0.1, 0.02, 0.02, 0.08, 0.04, 0.14]
    elif employee['子女数量'] >= 2:
        leave_probs = [0.22, 0.24, 0.1, 0.12, 0.03, 0.02, 0.12, 0.02, 0.13]
    leave_type = weighted_choice(rng, leave_type_items, leave_probs)
    # The set of possible reasons depends on the leave type.
    if leave_type in ['病假', '工伤假']:
        reason_category = weighted_choice(rng, ['身体不适', '就医复查', '职业疲劳'], [0.52, 0.3, 0.18])
    elif leave_type == '产检育儿假':
        reason_category = weighted_choice(rng, ['子女照护', '家庭事务', '身体不适'], [0.6, 0.25, 0.15])
    elif leave_type in ['婚假', '丧假']:
        reason_category = weighted_choice(rng, ['家庭事务', '突发事件'], [0.72, 0.28])
    elif leave_type in ['年假', '调休']:
        reason_category = weighted_choice(rng, ['职业疲劳', '家庭事务', '交通受阻'], [0.52, 0.28, 0.2])
    else:
        reason_category = weighted_choice(
            rng,
            ['身体不适', '家庭事务', '子女照护', '交通受阻', '突发事件', '职业疲劳'],
            [0.2, 0.22, 0.14, 0.12, 0.12, 0.2],
        )
    medical_certificate = int(
        leave_type in ['病假', '工伤假']
        or reason_category in ['身体不适', '就医复查']
        or (employee['是否慢性病史'] == 1 and leave_type == '其他')
    )
    # Short-circuit evaluation: the random draw only happens when the earlier
    # conditions are false and the leave is personal leave away from a holiday.
    urgent_leave = int(
        leave_type in ['病假', '工伤假']
        or reason_category in ['突发事件', '身体不适']
        or (near_holiday == 0 and leave_type == '事假' and rng.random() < 0.35)
    )
    continuous_absence = int(
        leave_type in ['病假', '工伤假', '产检育儿假']
        and (employee['近90天缺勤次数'] >= 2 or employee['近180天请假总时长'] >= 28)
    )
    previous_overtime = int(
        employee['月均加班时长'] >= 30
        or (employee['月均加班时长'] >= 24 and weekday in [1, 2, 5])
        or (employee['是否夜班岗位'] == 1 and rng.random() < 0.65)
    )
    season = season_from_month(month)
    channel = weighted_choice(rng, ['系统申请', '主管代提', '临时电话报备'], [0.68, 0.18, 0.14])
    # Component scores that feed the base duration.
    pressure_score = (
        employee['月均加班时长'] * 0.032
        + employee['通勤时长分钟'] * 0.018
        + employee['是否夜班岗位'] * 0.75
        + employee['是否跨城通勤'] * 0.32
        + previous_overtime * 0.35
    )
    health_score = (
        employee['是否慢性病史'] * 1.2
        + employee['年度体检异常标记'] * 0.55
        + (employee['BMI'] >= 28) * 0.3
        + (employee['近30天睡眠时长均值'] < 6.4) * 0.45
    )
    family_score = employee['子女数量'] * 0.18 + employee['是否独生子女家庭负担'] * 0.28
    # Resilience is subtracted from the base: performance grade, tenure and
    # exercise each contribute up to a cap.
    resilience_score = (
        (0.55 if employee['绩效等级'] == 'A' else 0.25 if employee['绩效等级'] == 'B' else 0.0)
        + min(employee['司龄年数'] / 26, 0.65)
        + min(employee['每周运动频次'] * 0.06, 0.25)
    )
    base = 0.35
    base += pressure_score
    base += health_score
    base += family_score
    # NOTE(review): both comparisons test against the empty string, so the
    # 0.18 branch is unreachable as written: an empty stress level adds 0.4 and
    # anything else subtracts 0.05. Confirm the intended stress labels.
    base += 0.4 if employee['心理压力等级'] == '' else (0.18 if employee['心理压力等级'] == '' else -0.05)
    base += 0.18 if near_holiday else 0.0
    base += 0.35 if continuous_absence else 0.0
    base += 0.28 if employee['近90天缺勤次数'] >= 3 else 0.0
    base += 0.18 if employee['近180天请假总时长'] >= 36 else 0.0
    base -= resilience_score
    # Additive duration bonuses looked up by leave type, reason, industry,
    # season code and weekday.
    leave_bonus = {
        '病假': 2.1,
        '事假': 0.8,
        '年假': 0.15,
        '调休': 0.1,
        '婚假': 3.1,
        '丧假': 2.8,
        '产检育儿假': 2.35,
        '工伤假': 3.9,
        '其他': 0.55,
    }
    reason_bonus = {
        '身体不适': 1.0,
        '家庭事务': 0.55,
        '子女照护': 0.75,
        '交通受阻': 0.2,
        '突发事件': 0.6,
        '职业疲劳': 0.7,
        '就医复查': 1.15,
    }
    industry_bonus = {
        '制造业': 0.42,
        '互联网': 0.22,
        '零售连锁': 0.28,
        '物流运输': 0.5,
        '金融服务': 0.12,
        '医药健康': 0.24,
        '建筑工程': 0.4,
    }
    season_bonus = {1: 0.35, 2: 0.0, 3: 0.15, 4: 0.05}
    weekday_bonus = {1: 0.05, 2: 0.0, 3: 0.0, 4: 0.05, 5: 0.15, 6: 0.25, 7: 0.3}
    duration = base
    duration += leave_bonus[leave_type]
    duration += reason_bonus[reason_category]
    duration += industry_bonus[employee['所属行业']]
    duration += season_bonus[season]
    duration += weekday_bonus[weekday]
    duration += 0.55 if medical_certificate else 0.0
    duration += 0.28 if urgent_leave else -0.06
    # Leave-type-specific adjustments on top of the table lookups.
    if leave_type == '病假' and employee['是否慢性病史'] == 1:
        duration += 0.85
    if leave_type == '工伤假':
        duration += 1.0 + employee['是否夜班岗位'] * 0.3
    if leave_type in ['婚假', '丧假']:
        duration += 0.7 + 0.18 * near_holiday
    if leave_type == '产检育儿假':
        duration += 0.55 + employee['子女数量'] * 0.12
    # Annual leave and compensatory leave are scaled down multiplicatively.
    if leave_type in ['年假', '调休']:
        duration *= 0.82 if near_holiday == 0 else 0.9
    # Add noise, then bound to [0.5, 18.0] and round to one decimal.
    duration = round(float(np.clip(duration + rng.normal(0, 0.35), 0.5, 18.0)), 1)
    event = employee.copy()
    event.update({
        '缺勤月份': month,
        '星期几': weekday,
        '是否节假日前后': near_holiday,
        '季节': season,
        '请假申请渠道': channel,
        '请假类型': leave_type,
        '请假原因大类': reason_category,
        '是否提供医院证明': medical_certificate,
        '是否临时请假': urgent_leave,
        '是否连续缺勤': continuous_absence,
        '前一工作日是否加班': previous_overtime,
        '缺勤时长(小时)': duration,
    })
    return event
def attach_event_timeline(df, random_state=None):
    """Assign each event a date in 2025 and per-employee sequence numbers.

    Rows are grouped by '员工编号' in order of first appearance, so the result
    is re-ordered employee by employee. Within each group, sorted random day
    offsets in [0, 364] are assigned in row order, giving non-decreasing dates.

    Added columns:
        '事件日期'       ISO date string (2025-01-01 plus the offset)
        '事件日期索引'   the day offset as an integer
        '事件序号'       1-based position of the event within the employee
        '员工历史事件数' number of events the employee has in `df`

    Args:
        df: event DataFrame containing an '员工编号' column; not modified.
        random_state: optional seed for the date offsets. Defaults to
            ``config.RANDOM_STATE`` when omitted.

    Returns:
        A new DataFrame with a fresh RangeIndex and the four columns above.
        An input with zero rows yields an empty frame that still has them.
    """
    df = df.copy()
    seed = config.RANDOM_STATE if random_state is None else random_state
    rng = np.random.default_rng(seed)
    base_date = np.datetime64('2025-01-01')
    timelines = []
    for _, group in df.groupby('员工编号', sort=False):
        group = group.copy().reset_index(drop=True)
        event_count = len(group)
        offsets = np.sort(rng.integers(0, 365, size=event_count))
        group['事件日期'] = [
            str(pd.Timestamp(base_date + np.timedelta64(int(offset), 'D')).date())
            for offset in offsets
        ]
        group['事件日期索引'] = offsets.astype(int)
        group['事件序号'] = np.arange(1, event_count + 1)
        group['员工历史事件数'] = event_count
        timelines.append(group)
    if not timelines:
        # pd.concat raises on an empty list; return an empty frame with the
        # timeline columns present instead.
        df = df.reset_index(drop=True)
        empty_dtypes = {
            '事件日期': object,
            '事件日期索引': int,
            '事件序号': int,
            '员工历史事件数': int,
        }
        for column, dtype in empty_dtypes.items():
            df[column] = pd.Series(index=df.index, dtype=dtype)
        return df
    return pd.concat(timelines, ignore_index=True)
def validate_dataset(df):
    """Check that a generated dataset has the expected shape and signals.

    Checks, in order:
      1. every required column is present (including the two flag columns
         that the signal checks below read);
      2. at least 10000 rows and at least 2000 distinct employees;
      3. the share of events longer than 8 hours lies in [0.15, 0.4];
      4. events with a medical certificate are longer on average than those
         without;
      5. night-shift events are longer on average than the others.

    Args:
        df: event-level DataFrame to validate.

    Raises:
        ValueError: if any check fails. A signal check also fails when one of
            its two groups is empty, because the mean is then NaN.
    """
    required_columns = [
        '员工编号',
        '所属行业',
        '岗位序列',
        '月均加班时长',
        '通勤时长分钟',
        '是否慢性病史',
        '请假类型',
        '事件序号',
        '事件日期索引',
        '员工历史事件数',
        '缺勤时长(小时)',
        # Read by the signal checks below; listing them here turns a missing
        # column into the documented ValueError instead of a KeyError.
        '是否提供医院证明',
        '是否夜班岗位',
    ]
    for column in required_columns:
        if column not in df.columns:
            raise ValueError(f'Missing required column: {column}')
    if len(df) < 10000:
        raise ValueError('Synthetic dataset is smaller than expected')
    if df['员工编号'].nunique() < 2000:
        raise ValueError('Employee coverage is too small')
    hours = df['缺勤时长(小时)']
    high_risk_ratio = (hours > 8).mean()
    if not 0.15 <= high_risk_ratio <= 0.4:
        raise ValueError(f'High risk ratio out of range: {high_risk_ratio:.3f}')
    # `not a > b` (rather than `a <= b`) also rejects NaN means, which occur
    # when one of the two groups has no rows.
    medical_mean = hours[df['是否提供医院证明'] == 1].mean()
    no_medical_mean = hours[df['是否提供医院证明'] == 0].mean()
    if not medical_mean > no_medical_mean:
        raise ValueError('Medical certificate signal is not effective')
    night_mean = hours[df['是否夜班岗位'] == 1].mean()
    day_mean = hours[df['是否夜班岗位'] == 0].mean()
    if not night_mean > day_mean:
        raise ValueError('Night shift signal is not effective')
def generate_dataset(output_path=None, sample_count=12000, random_state=None):
    """Generate, validate and optionally save the synthetic absence dataset.

    Builds the company and employee pools, samples `sample_count` events from
    uniformly chosen employees, attaches the event timeline and validates the
    result before writing anything.

    Args:
        output_path: CSV destination. When falsy, nothing is written. A bare
            filename (no directory part) is written to the current directory.
        sample_count: number of events to sample.
        random_state: seed for the pools and events; defaults to
            ``config.RANDOM_STATE``. Event dates are assigned by
            attach_event_timeline, which seeds its own generator.

    Returns:
        The generated event-level DataFrame.

    Raises:
        ValueError: if validate_dataset rejects the generated data.
    """
    seed = config.RANDOM_STATE if random_state is None else random_state
    rng = np.random.default_rng(seed)
    companies = build_company_pool(rng)
    employees = build_employee_pool(rng, companies)
    employee_idx = rng.integers(0, len(employees), size=sample_count)
    events = [sample_event(rng, employees[int(idx)]) for idx in employee_idx]
    df = attach_event_timeline(pd.DataFrame(events))
    validate_dataset(df)
    if output_path:
        # os.path.dirname returns '' for a bare filename, and os.makedirs('')
        # raises FileNotFoundError, so only create a directory when there is one.
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        df.to_csv(output_path, index=False, encoding='utf-8-sig')
    return df
def enrich_with_jdr_columns(df):
    """Append JD-R (Job Demands-Resources) theory dimension columns to a dataset.

    Synthesises 16 new feature columns from the existing employee/event
    attributes, plus a `_jdr_version` marker column:
    - Job demands: 工作自主性, 情绪劳动强度, 时间压力感知, 角色模糊度, 工作家庭冲突
    - Job resources: 上级支持, 同事支持, 技能多样性, 职业发展机会, 参与决策, 组织公平感
    - Personal resources: 自我效能感, 心理韧性, 乐观程度
    - Mediators: 工作倦怠 (burnout), 工作投入 (engagement)

    The 14 dimension columns are clipped to [1.0, 5.0]; the two mediators are
    clipped to [1.0, 7.0]. All values are rounded to one decimal. The random
    generator is seeded with ``config.RANDOM_STATE + 100``.

    NOTE(review): 工作自主性 (job autonomy) is grouped with the demands above,
    yet it enters the burnout formula with a negative sign and the engagement
    formula with a positive sign, like the resource columns. Confirm the
    intended grouping.

    Args:
        df: DataFrame containing the employee attribute columns read below;
            it is copied, not modified.

    Returns:
        A copy of `df` with the new columns appended.
    """
    rng = np.random.default_rng(config.RANDOM_STATE + 100)
    df = df.copy()
    n = len(df)
    # -- Helper: Likert-style score with a per-row conditional mean --
    def likert(mean_offset, std=0.8, low=1.0, high=5.0):
        # Draws n normal samples around `mean_offset` (scalar or length-n
        # array) and clips them to [low, high].
        return np.clip(rng.normal(mean_offset, std, size=n), low, high)
    # -- Pre-extract columns as arrays --
    overtime = df['月均加班时长'].values
    commute = df['通勤时长分钟'].values
    night = df['是否夜班岗位'].values
    children = df['子女数量'].values
    married_arr = (df['婚姻状态'] == '已婚').astype(int).values
    tenure = df['司龄年数'].values
    team_size = df['团队人数'].values
    manager_span = df['直属上级管理跨度'].values
    exercise = df['每周运动频次'].values
    sleep = df['近30天睡眠时长均值'].values
    chronic = df['是否慢性病史'].values
    perf_a = (df['绩效等级'] == 'A').astype(int).values
    perf_ab = df['绩效等级'].isin(['A', 'B']).astype(int).values
    # Ordinal encodings; labels missing from a map fall back to the fillna value.
    level_map = {'初级': 0, '中级': 1, '高级': 2, '主管': 3, '经理及以上': 4}
    level_vals = df['岗位级别'].map(level_map).fillna(1).values
    industry_vals = df['所属行业'].values
    employment_type = df['用工类型'].values
    job_family = df['岗位序列'].values
    company_scale_map = {
        '100人以下': 0, '100-499人': 1, '500-999人': 2, '1000-4999人': 3, '5000人及以上': 4
    }
    scale_vals = df['企业规模'].map(company_scale_map).fillna(1).values
    formal_employee = (df['用工类型'] == '正式员工').astype(int).values
    edu_map = {'中专及以下': 0, '大专': 1, '本科': 2, '硕士': 3, '博士': 4}
    edu_vals = df['最高学历'].map(edu_map).fillna(2).values
    # -- Job demands dimension (5 columns) --
    df['工作自主性'] = likert(
        3.2 + level_vals * 0.25
        + np.isin(industry_vals, ['互联网', '金融服务']).astype(int) * 0.3
        - night * 0.4
    ).round(1)
    df['情绪劳动强度'] = likert(
        2.8
        + np.isin(job_family, ['客服坐席', '销售业务']).astype(int) * 0.6
        + np.isin(industry_vals, ['医药健康', '零售连锁']).astype(int) * 0.3
    ).round(1)
    df['时间压力感知'] = likert(
        3.0 + overtime * 0.02 + commute * 0.01
        + np.isin(industry_vals, ['互联网', '金融服务']).astype(int) * 0.2
    ).round(1)
    df['角色模糊度'] = likert(
        2.5
        + np.isin(employment_type, ['劳务派遣', '外包驻场']).astype(int) * 0.5
        - tenure * 0.05
    ).round(1)
    df['工作家庭冲突'] = likert(
        2.6 + overtime * 0.02 + children * 0.3 + married_arr * 0.3
    ).round(1)
    # -- Job resources dimension (6 columns) --
    df['上级支持'] = likert(
        3.4 - manager_span * 0.02 + level_vals * 0.2
    ).round(1)
    df['同事支持'] = likert(
        3.3 + team_size * 0.02
        + np.isin(job_family, ['管理', '专业技术']).astype(int) * 0.2
    ).round(1)
    df['技能多样性'] = likert(
        3.0
        + np.isin(job_family, ['专业技术', '管理']).astype(int) * 0.5
        - np.isin(job_family, ['生产操作']).astype(int) * 0.3
    ).round(1)
    df['职业发展机会'] = likert(
        3.1
        + np.isin(industry_vals, ['互联网', '金融服务']).astype(int) * 0.4
        + scale_vals * 0.1
    ).round(1)
    df['参与决策'] = likert(
        2.8 + level_vals * 0.35
    ).round(1)
    df['组织公平感'] = likert(
        3.3 + formal_employee * 0.4 + perf_ab * 0.3
    ).round(1)
    # -- Personal resources dimension (3 columns) --
    df['自我效能感'] = likert(
        3.3 + perf_a * 0.4 + perf_ab * 0.2 + tenure * 0.03 + edu_vals * 0.08
    ).round(1)
    df['心理韧性'] = likert(
        3.2 + exercise * 0.1 + sleep * 0.15 + tenure * 0.02
    ).round(1)
    df['乐观程度'] = likert(
        3.3 + perf_ab * 0.3 - chronic * 0.3 + married_arr * 0.15
    ).round(1)
    # -- Mediator variables (2 columns) --
    # Burnout (1-7): health-impairment process — higher demands, higher burnout.
    # Built from the already-rounded columns above.
    df['工作倦怠'] = np.clip(
        rng.normal(3.0, 0.8, size=n)
        + overtime * 0.015 + night * 0.3 + commute * 0.008
        + df['情绪劳动强度'].values * 0.25
        + df['时间压力感知'].values * 0.25
        + df['工作家庭冲突'].values * 0.2
        + df['角色模糊度'].values * 0.15
        - df['工作自主性'].values * 0.2
        - df['上级支持'].values * 0.15
        - df['自我效能感'].values * 0.2
        - df['心理韧性'].values * 0.15,
        1.0, 7.0
    ).round(1)
    # Engagement (1-7): motivational process — higher resources, higher
    # engagement; burnout enters with a negative weight.
    df['工作投入'] = np.clip(
        rng.normal(3.5, 0.8, size=n)
        + df['工作自主性'].values * 0.2
        + df['上级支持'].values * 0.2
        + df['同事支持'].values * 0.15
        + df['技能多样性'].values * 0.15
        + df['职业发展机会'].values * 0.15
        + df['参与决策'].values * 0.1
        + df['组织公平感'].values * 0.1
        + df['自我效能感'].values * 0.2
        + df['心理韧性'].values * 0.15
        + df['乐观程度'].values * 0.15
        - df['工作倦怠'].values * 0.2,
        1.0, 7.0
    ).round(1)
    # JD-R data version marker
    df['_jdr_version'] = config.JDR_DATA_VERSION
    return df
def ensure_dataset():
    """Make sure a valid, JD-R-enriched dataset exists at ``config.RAW_DATA_PATH``.

    The raw CSV is regenerated when it is missing, unreadable or fails
    validate_dataset. The file is then re-read and, if any of the sentinel
    JD-R columns is absent, enriched and written back in place.

    Returns nothing; callers read the CSV themselves.
    """
    needs_regenerate = not os.path.exists(config.RAW_DATA_PATH)
    if not needs_regenerate:
        try:
            df = pd.read_csv(config.RAW_DATA_PATH)
            validate_dataset(df)
        except Exception:
            # Deliberately broad: any read or validation failure means the
            # file on disk cannot be trusted, so rebuild it from scratch.
            needs_regenerate = True
    if needs_regenerate:
        generate_dataset(config.RAW_DATA_PATH)
    df = pd.read_csv(config.RAW_DATA_PATH)
    # Check whether JD-R enrichment is still needed.
    # NOTE(review): only column presence is checked; the `_jdr_version` marker
    # written by enrich_with_jdr_columns is not consulted here — confirm
    # whether a version mismatch should trigger re-enrichment.
    jdr_columns = ['工作自主性', '上级支持', '自我效能感', '工作倦怠', '工作投入']
    if not all(col in df.columns for col in jdr_columns):
        df = enrich_with_jdr_columns(df)
        # os.path.dirname returns '' for a bare filename, and os.makedirs('')
        # raises FileNotFoundError, so only create a directory when there is one.
        parent_dir = os.path.dirname(config.RAW_DATA_PATH)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        df.to_csv(config.RAW_DATA_PATH, index=False, encoding='utf-8-sig')
# Script entry point: regenerate the raw dataset at config.RAW_DATA_PATH
# (without JD-R enrichment) and print a preview of the first rows.
if __name__ == '__main__':
    dataset = generate_dataset(config.RAW_DATA_PATH)
    print(f'Generated dataset: {config.RAW_DATA_PATH}')
    print(dataset.head())