Files
forsetsystem/backend/core/feature_mining.py
shenjianZ e63267cef6 feat: 将数据集从国外员工缺勤数据替换为中国企业缺勤模拟数据
- 新增中国企业员工缺勤模拟数据集生成脚本(generate_dataset.py),覆盖7个行业、180家企业、2600名员工
  - 重构 config.py,更新特征字段为中文名称,调整目标列、员工ID、行业类型等配置
  - 重构 clustering.py,简化聚类逻辑,更新聚类特征和群体命名(高压通勤型、健康波动型等)
  - 重构 feature_mining.py,更新相关性分析和群体比较维度(按行业、班次、婚姻状态等)
  - 新增 model_features.py 定义模型训练特征
  - 更新 preprocessing.py 和 train_model.py 适配新数据结构
  - 更新各 API 路由默认参数(model: random_forest, dimension: industry)
  - 前端更新主题样式和各视图组件适配中文字段
  - 更新系统名称为 China Enterprise Absence Analysis System
2026-03-11 10:46:58 +08:00

73 lines
2.5 KiB
Python

import numpy as np
import config
from core.preprocessing import get_clean_data
def calculate_correlation():
    """Return the pairwise correlation matrix of the numeric columns.

    The dataset comes from ``get_clean_data()``. The column named by
    ``config.EMPLOYEE_ID_COLUMN`` is excluded when it is among the numeric
    columns.

    Returns:
        The result of ``DataFrame.corr()`` over the remaining numeric columns.
    """
    data = get_clean_data()
    numeric_columns = data.select_dtypes(include=[np.number]).columns.tolist()
    try:
        # Keep the employee ID column out of the correlation.
        numeric_columns.remove(config.EMPLOYEE_ID_COLUMN)
    except ValueError:
        # The ID column is not among the numeric columns; nothing to drop.
        pass
    return data[numeric_columns].corr()
def get_correlation_for_heatmap():
    """Build the heatmap payload for a fixed list of key features.

    Only the key features that are present as columns of the matrix returned
    by ``calculate_correlation()`` are kept, in their listed order.

    Returns:
        A dict with two entries:
        ``'features'`` - one label per kept feature, looked up in
        ``config.FEATURE_NAME_CN`` and falling back to the column name;
        ``'matrix'`` - the matching correlation sub-matrix, rounded to two
        decimals and converted to nested lists.
    """
    full_matrix = calculate_correlation()
    candidates = (
        '月均加班时长',
        '通勤时长分钟',
        '近90天缺勤次数',
        'BMI',
        '近30天睡眠时长均值',
        '缺勤时长(小时)',
    )

    # Skip candidates that the correlation matrix does not contain.
    selected = []
    for feature in candidates:
        if feature in full_matrix.columns:
            selected.append(feature)

    subset = full_matrix.loc[selected, selected]
    labels = [config.FEATURE_NAME_CN.get(name, name) for name in selected]
    rounded = subset.values.round(2)
    return {
        'features': labels,
        'matrix': rounded.tolist(),
    }
def group_comparison(dimension):
    """Compare the mean of ``config.TARGET_COLUMN`` across one dimension.

    Args:
        dimension: One of ``'industry'``, ``'shift_type'``, ``'job_family'``,
            ``'marital_status'`` or ``'chronic_disease'``.

    Returns:
        A dict with:
        ``'dimension'`` - the requested key;
        ``'dimension_name'`` - its display name;
        ``'groups'`` - one entry per distinct value of the dimension column
        (``name``, ``value``, ``avg_hours``, ``count``, ``percentage``),
        sorted by ``avg_hours`` in descending order;
        ``'difference'`` - the gap between the highest and lowest group
        average (``value``) and that gap relative to the lowest average
        (``percentage``). Both are 0 when there are fewer than two groups.

    Raises:
        ValueError: If ``dimension`` is not one of the supported keys.
    """
    # key -> (data column, optional value -> label mapping, display name)
    dimension_map = {
        'industry': ('所属行业', None, '所属行业'),
        'shift_type': ('班次类型', None, '班次类型'),
        'job_family': ('岗位序列', None, '岗位序列'),
        'marital_status': ('婚姻状态', None, '婚姻状态'),
        'chronic_disease': ('是否慢性病史', {0: '无慢性病史', 1: '有慢性病史'}, '慢性病史'),
    }
    # Validate before loading the dataset so a bad request fails fast.
    if dimension not in dimension_map:
        raise ValueError(f"Invalid dimension: {dimension}")
    column, value_map, dimension_name = dimension_map[dimension]

    df = get_clean_data()
    total = len(df)

    groups = []
    for value in sorted(df[column].unique()):
        group_df = df[df[column] == value]
        size = len(group_df)
        # Fall back to str(value) for unmapped values so the label is always
        # a plain string rather than a raw (possibly numpy) scalar.
        label = value_map.get(value, str(value)) if value_map else str(value)
        groups.append({
            'name': label,
            'value': int(value) if isinstance(value, (int, np.integer)) else str(value),
            'avg_hours': round(group_df[config.TARGET_COLUMN].mean(), 2),
            'count': size,
            'percentage': round(size / total * 100, 1),
        })
    groups.sort(key=lambda item: item['avg_hours'], reverse=True)

    # A difference only exists between at least two groups; with a single
    # group the top and bottom are the same entry, so the gap is 0.
    diff_value = 0
    diff_percentage = 0
    if len(groups) > 1:
        top = groups[0]['avg_hours']
        bottom = groups[-1]['avg_hours']
        diff_value = round(top - bottom, 2)
        # Guard against dividing by a zero lowest average.
        if bottom:
            diff_percentage = round(diff_value / bottom * 100, 1)

    return {
        'dimension': dimension,
        'dimension_name': dimension_name,
        'groups': groups,
        'difference': {
            'value': diff_value,
            'percentage': diff_percentage,
        },
    }