feat: 将数据集从国外员工缺勤数据替换为中国企业缺勤模拟数据
- 新增中国企业员工缺勤模拟数据集生成脚本(generate_dataset.py),覆盖7个行业、180家企业、2600名员工
- 重构 config.py,更新特征字段为中文名称,调整目标列、员工ID、行业类型等配置
- 重构 clustering.py,简化聚类逻辑,更新聚类特征和群体命名(高压通勤型、健康波动型等)
- 重构 feature_mining.py,更新相关性分析和群体比较维度(按行业、班次、婚姻状态等)
- 新增 model_features.py 定义模型训练特征
- 更新 preprocessing.py 和 train_model.py 适配新数据结构
- 更新各 API 路由默认参数(model: random_forest, dimension: industry)
- 前端更新主题样式和各视图组件适配中文字段
- 更新系统名称为 China Enterprise Absence Analysis System
This commit is contained in:
@@ -8,7 +8,7 @@ analysis_bp = Blueprint('analysis', __name__, url_prefix='/api/analysis')
|
||||
@analysis_bp.route('/importance', methods=['GET'])
|
||||
def get_importance():
|
||||
try:
|
||||
model_type = request.args.get('model', 'rf')
|
||||
model_type = request.args.get('model', 'random_forest')
|
||||
result = analysis_service.get_feature_importance(model_type)
|
||||
return jsonify({
|
||||
'code': 200,
|
||||
@@ -43,7 +43,7 @@ def get_correlation():
|
||||
@analysis_bp.route('/compare', methods=['GET'])
|
||||
def get_compare():
|
||||
try:
|
||||
dimension = request.args.get('dimension', 'drinker')
|
||||
dimension = request.args.get('dimension', 'industry')
|
||||
result = analysis_service.get_group_comparison(dimension)
|
||||
return jsonify({
|
||||
'code': 200,
|
||||
|
||||
@@ -49,8 +49,8 @@ def get_profile():
|
||||
def get_scatter():
|
||||
try:
|
||||
n_clusters = request.args.get('n_clusters', 3, type=int)
|
||||
x_axis = request.args.get('x_axis', 'Age')
|
||||
y_axis = request.args.get('y_axis', 'Absenteeism time in hours')
|
||||
x_axis = request.args.get('x_axis', '月均加班时长')
|
||||
y_axis = request.args.get('y_axis', '缺勤时长(小时)')
|
||||
|
||||
n_clusters = max(2, min(10, n_clusters))
|
||||
|
||||
|
||||
@@ -15,7 +15,7 @@ def create_app():
|
||||
def index():
|
||||
return {
|
||||
'code': 200,
|
||||
'message': 'Employee Absenteeism Analysis System API',
|
||||
'message': 'China Enterprise Absence Analysis System API',
|
||||
'data': {
|
||||
'version': '1.0.0',
|
||||
'endpoints': {
|
||||
|
||||
@@ -1,14 +1,15 @@
|
||||
import os
|
||||
|
||||
|
||||
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
DATA_DIR = os.path.join(BASE_DIR, 'data')
|
||||
RAW_DATA_DIR = os.path.join(DATA_DIR, 'raw')
|
||||
PROCESSED_DATA_DIR = os.path.join(DATA_DIR, 'processed')
|
||||
|
||||
MODELS_DIR = os.path.join(BASE_DIR, 'models')
|
||||
|
||||
RAW_DATA_PATH = os.path.join(RAW_DATA_DIR, 'Absenteeism_at_work.csv')
|
||||
RAW_DATA_FILENAME = 'china_enterprise_absence_events.csv'
|
||||
RAW_DATA_PATH = os.path.join(RAW_DATA_DIR, RAW_DATA_FILENAME)
|
||||
CLEAN_DATA_PATH = os.path.join(PROCESSED_DATA_DIR, 'clean_data.csv')
|
||||
|
||||
RF_MODEL_PATH = os.path.join(MODELS_DIR, 'rf_model.pkl')
|
||||
@@ -17,132 +18,127 @@ KMEANS_MODEL_PATH = os.path.join(MODELS_DIR, 'kmeans_model.pkl')
|
||||
SCALER_PATH = os.path.join(MODELS_DIR, 'scaler.pkl')
|
||||
ENCODER_PATH = os.path.join(MODELS_DIR, 'encoder.pkl')
|
||||
|
||||
CSV_SEPARATOR = ';'
|
||||
|
||||
CSV_SEPARATOR = ','
|
||||
RANDOM_STATE = 42
|
||||
TEST_SIZE = 0.2
|
||||
|
||||
FEATURE_NAMES = [
|
||||
'ID',
|
||||
'Reason for absence',
|
||||
'Month of absence',
|
||||
'Day of the week',
|
||||
'Seasons',
|
||||
'Transportation expense',
|
||||
'Distance from Residence to Work',
|
||||
'Service time',
|
||||
'Age',
|
||||
'Work load Average/day ',
|
||||
'Hit target',
|
||||
'Disciplinary failure',
|
||||
'Education',
|
||||
'Son',
|
||||
'Social drinker',
|
||||
'Social smoker',
|
||||
'Pet',
|
||||
'Weight',
|
||||
'Height',
|
||||
'Body mass index',
|
||||
'Absenteeism time in hours'
|
||||
]
|
||||
|
||||
CATEGORICAL_FEATURES = [
|
||||
'Reason for absence',
|
||||
'Month of absence',
|
||||
'Day of the week',
|
||||
'Seasons',
|
||||
'Disciplinary failure',
|
||||
'Education',
|
||||
'Social drinker',
|
||||
'Social smoker'
|
||||
]
|
||||
|
||||
NUMERICAL_FEATURES = [
|
||||
'Transportation expense',
|
||||
'Distance from Residence to Work',
|
||||
'Service time',
|
||||
'Age',
|
||||
'Work load Average/day ',
|
||||
'Hit target',
|
||||
'Son',
|
||||
'Pet',
|
||||
'Body mass index'
|
||||
]
|
||||
|
||||
REASON_NAMES = {
|
||||
0: '未知原因',
|
||||
1: '传染病',
|
||||
2: '肿瘤',
|
||||
3: '血液疾病',
|
||||
4: '内分泌疾病',
|
||||
5: '精神行为障碍',
|
||||
6: '神经系统疾病',
|
||||
7: '眼部疾病',
|
||||
8: '耳部疾病',
|
||||
9: '循环系统疾病',
|
||||
10: '呼吸系统疾病',
|
||||
11: '消化系统疾病',
|
||||
12: '皮肤疾病',
|
||||
13: '肌肉骨骼疾病',
|
||||
14: '泌尿生殖疾病',
|
||||
15: '妊娠相关',
|
||||
16: '围产期疾病',
|
||||
17: '先天性畸形',
|
||||
18: '症状体征',
|
||||
19: '损伤中毒',
|
||||
20: '外部原因',
|
||||
21: '健康因素',
|
||||
22: '医疗随访',
|
||||
23: '医疗咨询',
|
||||
24: '献血',
|
||||
25: '实验室检查',
|
||||
26: '无故缺勤',
|
||||
27: '理疗',
|
||||
28: '牙科咨询'
|
||||
}
|
||||
TARGET_COLUMN = '缺勤时长(小时)'
|
||||
EMPLOYEE_ID_COLUMN = '员工编号'
|
||||
COMPANY_ID_COLUMN = '企业编号'
|
||||
|
||||
WEEKDAY_NAMES = {
|
||||
2: '周一',
|
||||
3: '周二',
|
||||
4: '周三',
|
||||
5: '周四',
|
||||
6: '周五'
|
||||
1: '周一',
|
||||
2: '周二',
|
||||
3: '周三',
|
||||
4: '周四',
|
||||
5: '周五',
|
||||
6: '周六',
|
||||
7: '周日',
|
||||
}
|
||||
|
||||
SEASON_NAMES = {
|
||||
1: '夏季',
|
||||
2: '秋季',
|
||||
3: '冬季',
|
||||
4: '春季'
|
||||
1: '冬季',
|
||||
2: '春季',
|
||||
3: '夏季',
|
||||
4: '秋季',
|
||||
}
|
||||
|
||||
EDUCATION_NAMES = {
|
||||
1: '高中',
|
||||
2: '本科',
|
||||
3: '研究生',
|
||||
4: '博士'
|
||||
}
|
||||
INDUSTRY_NAMES = [
|
||||
'制造业',
|
||||
'互联网',
|
||||
'零售连锁',
|
||||
'物流运输',
|
||||
'金融服务',
|
||||
'医药健康',
|
||||
'建筑工程',
|
||||
]
|
||||
|
||||
LEAVE_TYPE_NAMES = [
|
||||
'病假',
|
||||
'事假',
|
||||
'年假',
|
||||
'调休',
|
||||
'婚假',
|
||||
'丧假',
|
||||
'产检育儿假',
|
||||
'工伤假',
|
||||
'其他',
|
||||
]
|
||||
|
||||
REASON_CATEGORY_NAMES = [
|
||||
'身体不适',
|
||||
'家庭事务',
|
||||
'子女照护',
|
||||
'交通受阻',
|
||||
'突发事件',
|
||||
'职业疲劳',
|
||||
'就医复查',
|
||||
]
|
||||
|
||||
FEATURE_NAME_CN = {
|
||||
'ID': '员工标识',
|
||||
'Reason for absence': '缺勤原因',
|
||||
'Month of absence': '缺勤月份',
|
||||
'Day of the week': '星期几',
|
||||
'Seasons': '季节',
|
||||
'Transportation expense': '交通费用',
|
||||
'Distance from Residence to Work': '通勤距离',
|
||||
'Service time': '工龄',
|
||||
'Age': '年龄',
|
||||
'Work load Average/day ': '日均工作负荷',
|
||||
'Hit target': '达标率',
|
||||
'Disciplinary failure': '违纪记录',
|
||||
'Education': '学历',
|
||||
'Son': '子女数量',
|
||||
'Social drinker': '饮酒习惯',
|
||||
'Social smoker': '吸烟习惯',
|
||||
'Pet': '宠物数量',
|
||||
'Weight': '体重',
|
||||
'Height': '身高',
|
||||
'Body mass index': 'BMI指数',
|
||||
'Absenteeism time in hours': '缺勤时长'
|
||||
'企业编号': '企业编号',
|
||||
'所属行业': '所属行业',
|
||||
'企业规模': '企业规模',
|
||||
'所在城市等级': '所在城市等级',
|
||||
'用工类型': '用工类型',
|
||||
'部门条线': '部门条线',
|
||||
'岗位序列': '岗位序列',
|
||||
'岗位级别': '岗位级别',
|
||||
'员工编号': '员工编号',
|
||||
'性别': '性别',
|
||||
'年龄': '年龄',
|
||||
'司龄年数': '司龄年数',
|
||||
'最高学历': '最高学历',
|
||||
'婚姻状态': '婚姻状态',
|
||||
'是否本地户籍': '是否本地户籍',
|
||||
'子女数量': '子女数量',
|
||||
'是否独生子女家庭负担': '独生子女家庭负担',
|
||||
'居住类型': '居住类型',
|
||||
'班次类型': '班次类型',
|
||||
'是否夜班岗位': '是否夜班岗位',
|
||||
'月均加班时长': '月均加班时长',
|
||||
'近30天出勤天数': '近30天出勤天数',
|
||||
'近90天缺勤次数': '近90天缺勤次数',
|
||||
'近180天请假总时长': '近180天请假总时长',
|
||||
'通勤时长分钟': '通勤时长分钟',
|
||||
'通勤距离公里': '通勤距离公里',
|
||||
'是否跨城通勤': '是否跨城通勤',
|
||||
'绩效等级': '绩效等级',
|
||||
'近12月违纪次数': '近12月违纪次数',
|
||||
'团队人数': '团队人数',
|
||||
'直属上级管理跨度': '直属上级管理跨度',
|
||||
'BMI': 'BMI',
|
||||
'是否慢性病史': '是否慢性病史',
|
||||
'年度体检异常标记': '年度体检异常',
|
||||
'近30天睡眠时长均值': '睡眠时长',
|
||||
'每周运动频次': '运动频次',
|
||||
'是否吸烟': '是否吸烟',
|
||||
'是否饮酒': '是否饮酒',
|
||||
'心理压力等级': '心理压力等级',
|
||||
'是否长期久坐岗位': '是否久坐岗位',
|
||||
'缺勤月份': '缺勤月份',
|
||||
'星期几': '星期几',
|
||||
'是否节假日前后': '节假日前后',
|
||||
'季节': '季节',
|
||||
'请假申请渠道': '请假申请渠道',
|
||||
'请假类型': '请假类型',
|
||||
'请假原因大类': '请假原因大类',
|
||||
'是否提供医院证明': '医院证明',
|
||||
'是否临时请假': '临时请假',
|
||||
'是否连续缺勤': '连续缺勤',
|
||||
'前一工作日是否加班': '前一工作日加班',
|
||||
'缺勤时长(小时)': '缺勤时长',
|
||||
'加班通勤压力指数': '加班通勤压力指数',
|
||||
'家庭负担指数': '家庭负担指数',
|
||||
'健康风险指数': '健康风险指数',
|
||||
'岗位稳定性指数': '岗位稳定性指数',
|
||||
'节假日风险标记': '节假日风险标记',
|
||||
'排班压力标记': '排班压力标记',
|
||||
'缺勤历史强度': '缺勤历史强度',
|
||||
'生活规律指数': '生活规律指数',
|
||||
'管理负荷指数': '管理负荷指数',
|
||||
'工龄分层': '工龄分层',
|
||||
'年龄分层': '年龄分层',
|
||||
'通勤分层': '通勤分层',
|
||||
'加班分层': '加班分层',
|
||||
}
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.cluster import KMeans
|
||||
from sklearn.preprocessing import MinMaxScaler
|
||||
import joblib
|
||||
import os
|
||||
|
||||
import config
|
||||
from core.preprocessing import get_clean_data
|
||||
@@ -14,216 +11,123 @@ class KMeansAnalyzer:
|
||||
self.n_clusters = n_clusters
|
||||
self.model = None
|
||||
self.scaler = MinMaxScaler()
|
||||
self.data = None
|
||||
self.data_scaled = None
|
||||
self.labels = None
|
||||
|
||||
def _get_feature_columns(self, df):
|
||||
df.columns = [col.strip() for col in df.columns]
|
||||
|
||||
feature_map = {
|
||||
'Age': None,
|
||||
'Service time': None,
|
||||
'Work load Average/day': None,
|
||||
'Body mass index': None,
|
||||
'Absenteeism time in hours': None
|
||||
}
|
||||
|
||||
for key in feature_map:
|
||||
if key in df.columns:
|
||||
feature_map[key] = key
|
||||
else:
|
||||
for col in df.columns:
|
||||
if key.replace(' ', '').lower() == col.replace(' ', '').lower():
|
||||
feature_map[key] = col
|
||||
break
|
||||
|
||||
actual_features = [v for v in feature_map.values() if v is not None]
|
||||
return actual_features
|
||||
|
||||
self.feature_cols = [
|
||||
'年龄',
|
||||
'司龄年数',
|
||||
'月均加班时长',
|
||||
'通勤时长分钟',
|
||||
'BMI',
|
||||
'缺勤时长(小时)',
|
||||
]
|
||||
|
||||
def fit(self, n_clusters=None):
|
||||
if n_clusters:
|
||||
self.n_clusters = n_clusters
|
||||
|
||||
df = get_clean_data()
|
||||
df = df.reset_index(drop=True)
|
||||
|
||||
feature_cols = self._get_feature_columns(df)
|
||||
|
||||
if not feature_cols:
|
||||
feature_cols = ['Age', 'Service time', 'Body mass index', 'Absenteeism time in hours']
|
||||
feature_cols = [c for c in feature_cols if c in df.columns]
|
||||
|
||||
self.data = df[feature_cols].values
|
||||
|
||||
self.scaler = MinMaxScaler()
|
||||
self.data_scaled = self.scaler.fit_transform(self.data)
|
||||
|
||||
self.model = KMeans(
|
||||
n_clusters=self.n_clusters,
|
||||
random_state=config.RANDOM_STATE,
|
||||
n_init=10
|
||||
)
|
||||
|
||||
self.labels = self.model.fit_predict(self.data_scaled)
|
||||
|
||||
df = get_clean_data().reset_index(drop=True)
|
||||
data = df[self.feature_cols].values
|
||||
data_scaled = self.scaler.fit_transform(data)
|
||||
self.model = KMeans(n_clusters=self.n_clusters, random_state=config.RANDOM_STATE, n_init=10)
|
||||
self.labels = self.model.fit_predict(data_scaled)
|
||||
return self.model
|
||||
|
||||
|
||||
def get_cluster_results(self, n_clusters=3):
|
||||
if self.model is None or self.n_clusters != n_clusters:
|
||||
self.fit(n_clusters)
|
||||
|
||||
centers = self.scaler.inverse_transform(self.model.cluster_centers_)
|
||||
|
||||
unique, counts = np.unique(self.labels, return_counts=True)
|
||||
total = len(self.labels)
|
||||
|
||||
cluster_names = self._generate_cluster_names(centers)
|
||||
|
||||
feature_cols = self._get_feature_columns(get_clean_data())
|
||||
|
||||
names = self._generate_cluster_names(centers)
|
||||
clusters = []
|
||||
for i, (cluster_id, count) in enumerate(zip(unique, counts)):
|
||||
center_dict = {}
|
||||
for j, fname in enumerate(feature_cols):
|
||||
if j < len(centers[i]):
|
||||
center_dict[fname] = round(centers[i][j], 2)
|
||||
|
||||
for cluster_id, count in zip(unique, counts):
|
||||
center = centers[int(cluster_id)]
|
||||
clusters.append({
|
||||
'id': int(cluster_id),
|
||||
'name': cluster_names.get(cluster_id, f'群体{cluster_id+1}'),
|
||||
'name': names.get(int(cluster_id), f'群体{int(cluster_id) + 1}'),
|
||||
'member_count': int(count),
|
||||
'percentage': round(count / total * 100, 1),
|
||||
'center': center_dict,
|
||||
'description': self._generate_description(cluster_names.get(cluster_id, ''))
|
||||
'center': {
|
||||
feature: round(float(value), 2)
|
||||
for feature, value in zip(self.feature_cols, center)
|
||||
},
|
||||
'description': self._generate_description(names.get(int(cluster_id), '')),
|
||||
})
|
||||
|
||||
return {
|
||||
'n_clusters': self.n_clusters,
|
||||
'clusters': clusters
|
||||
}
|
||||
|
||||
return {'n_clusters': self.n_clusters, 'clusters': clusters}
|
||||
|
||||
def get_cluster_profile(self, n_clusters=3):
|
||||
if self.model is None or self.n_clusters != n_clusters:
|
||||
self.fit(n_clusters)
|
||||
|
||||
centers_scaled = self.model.cluster_centers_
|
||||
|
||||
df = get_clean_data()
|
||||
df.columns = [col.strip() for col in df.columns]
|
||||
feature_cols = self._get_feature_columns(df)
|
||||
|
||||
dimensions = ['年龄', '工龄', '工作负荷', 'BMI', '缺勤倾向'][:len(feature_cols)]
|
||||
|
||||
cluster_names = self._generate_cluster_names(
|
||||
self.scaler.inverse_transform(centers_scaled)
|
||||
)
|
||||
|
||||
clusters = []
|
||||
for i in range(self.n_clusters):
|
||||
clusters.append({
|
||||
'id': i,
|
||||
'name': cluster_names.get(i, f'群体{i+1}'),
|
||||
'values': [round(v, 2) for v in centers_scaled[i]]
|
||||
})
|
||||
|
||||
names = self._generate_cluster_names(self.scaler.inverse_transform(centers_scaled))
|
||||
return {
|
||||
'dimensions': dimensions,
|
||||
'dimension_keys': feature_cols,
|
||||
'clusters': clusters
|
||||
'dimensions': ['年龄', '司龄', '加班', '通勤', 'BMI', '缺勤'],
|
||||
'dimension_keys': self.feature_cols,
|
||||
'clusters': [
|
||||
{
|
||||
'id': idx,
|
||||
'name': names.get(idx, f'群体{idx + 1}'),
|
||||
'values': [round(float(v), 2) for v in centers_scaled[idx]],
|
||||
}
|
||||
for idx in range(self.n_clusters)
|
||||
],
|
||||
}
|
||||
|
||||
def get_scatter_data(self, n_clusters=3, x_axis='Age', y_axis='Absenteeism time in hours'):
|
||||
|
||||
def get_scatter_data(self, n_clusters=3, x_axis='月均加班时长', y_axis='缺勤时长(小时)'):
|
||||
if self.model is None or self.n_clusters != n_clusters:
|
||||
self.fit(n_clusters)
|
||||
|
||||
df = get_clean_data()
|
||||
df = df.reset_index(drop=True)
|
||||
df.columns = [col.strip() for col in df.columns]
|
||||
|
||||
x_col = None
|
||||
y_col = None
|
||||
|
||||
for col in df.columns:
|
||||
if x_axis.replace(' ', '').lower() in col.replace(' ', '').lower():
|
||||
x_col = col
|
||||
if y_axis.replace(' ', '').lower() in col.replace(' ', '').lower():
|
||||
y_col = col
|
||||
|
||||
if x_col is None:
|
||||
x_col = df.columns[0]
|
||||
if y_col is None:
|
||||
y_col = df.columns[-1]
|
||||
|
||||
df = get_clean_data().reset_index(drop=True)
|
||||
if x_axis not in df.columns:
|
||||
x_axis = '月均加班时长'
|
||||
if y_axis not in df.columns:
|
||||
y_axis = config.TARGET_COLUMN
|
||||
points = []
|
||||
for idx in range(min(len(df), len(self.labels))):
|
||||
row = df.iloc[idx]
|
||||
points.append({
|
||||
'employee_id': int(row['ID']),
|
||||
'x': float(row[x_col]),
|
||||
'y': float(row[y_col]),
|
||||
'cluster_id': int(self.labels[idx])
|
||||
'employee_id': str(row[config.EMPLOYEE_ID_COLUMN]),
|
||||
'x': float(row[x_axis]),
|
||||
'y': float(row[y_axis]),
|
||||
'cluster_id': int(self.labels[idx]),
|
||||
})
|
||||
|
||||
cluster_colors = {
|
||||
'0': '#67C23A',
|
||||
'1': '#E6A23C',
|
||||
'2': '#F56C6C',
|
||||
'3': '#909399',
|
||||
'4': '#409EFF'
|
||||
}
|
||||
|
||||
return {
|
||||
'x_axis': x_col,
|
||||
'x_axis_name': config.FEATURE_NAME_CN.get(x_col, x_col),
|
||||
'y_axis': y_col,
|
||||
'y_axis_name': config.FEATURE_NAME_CN.get(y_col, y_col),
|
||||
'x_axis': x_axis,
|
||||
'x_axis_name': config.FEATURE_NAME_CN.get(x_axis, x_axis),
|
||||
'y_axis': y_axis,
|
||||
'y_axis_name': config.FEATURE_NAME_CN.get(y_axis, y_axis),
|
||||
'points': points[:500],
|
||||
'cluster_colors': cluster_colors
|
||||
'cluster_colors': {
|
||||
'0': '#5B8FF9',
|
||||
'1': '#61DDAA',
|
||||
'2': '#F6BD16',
|
||||
'3': '#E8684A',
|
||||
'4': '#6DC8EC',
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def _generate_cluster_names(self, centers):
|
||||
names = {}
|
||||
|
||||
for i, center in enumerate(centers):
|
||||
if len(center) >= 5:
|
||||
service_time = center[1]
|
||||
work_load = center[2]
|
||||
bmi = center[3]
|
||||
absent = center[4]
|
||||
for idx, center in enumerate(centers):
|
||||
_, tenure, overtime, commute, bmi, absence = center
|
||||
if overtime > 38 and commute > 55 and absence > 8:
|
||||
names[idx] = '高压通勤型'
|
||||
elif bmi > 27 and absence > 8:
|
||||
names[idx] = '健康波动型'
|
||||
elif tenure > 8 and absence < 6:
|
||||
names[idx] = '稳定低风险型'
|
||||
elif overtime > 28 and absence > 7:
|
||||
names[idx] = '轮班负荷型'
|
||||
else:
|
||||
service_time = center[1] if len(center) > 1 else 0
|
||||
work_load = 0
|
||||
bmi = center[2] if len(center) > 2 else 0
|
||||
absent = center[3] if len(center) > 3 else 0
|
||||
|
||||
if service_time > 15 and absent < 3:
|
||||
names[i] = '模范型员工'
|
||||
elif work_load > 260 and absent > 5:
|
||||
names[i] = '压力型员工'
|
||||
elif bmi > 28:
|
||||
names[i] = '生活习惯型员工'
|
||||
else:
|
||||
names[i] = f'群体{i+1}'
|
||||
|
||||
names[idx] = f'群体{idx + 1}'
|
||||
return names
|
||||
|
||||
|
||||
def _generate_description(self, name):
|
||||
descriptions = {
|
||||
'模范型员工': '工龄长、工作稳定、缺勤率低',
|
||||
'压力型员工': '工作负荷大、缺勤较多',
|
||||
'生活习惯型员工': 'BMI偏高、需关注健康'
|
||||
'高压通勤型': '加班和通勤压力都高,缺勤时长偏长。',
|
||||
'健康波动型': '健康相关风险更高,需要重点关注。',
|
||||
'稳定低风险型': '司龄较长,缺勤水平稳定且偏低。',
|
||||
'轮班负荷型': '排班和工作负荷较重,缺勤风险较高。',
|
||||
}
|
||||
return descriptions.get(name, '常规员工群体')
|
||||
|
||||
def save_model(self):
|
||||
os.makedirs(config.MODELS_DIR, exist_ok=True)
|
||||
joblib.dump(self.model, config.KMEANS_MODEL_PATH)
|
||||
|
||||
def load_model(self):
|
||||
if os.path.exists(config.KMEANS_MODEL_PATH):
|
||||
self.model = joblib.load(config.KMEANS_MODEL_PATH)
|
||||
self.n_clusters = self.model.n_clusters
|
||||
return descriptions.get(name, '常规员工群体。')
|
||||
|
||||
|
||||
kmeans_analyzer = KMeansAnalyzer()
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
import config
|
||||
@@ -7,145 +6,67 @@ from core.preprocessing import get_clean_data
|
||||
|
||||
def calculate_correlation():
|
||||
df = get_clean_data()
|
||||
|
||||
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
|
||||
|
||||
if 'ID' in numeric_cols:
|
||||
numeric_cols.remove('ID')
|
||||
|
||||
corr_matrix = df[numeric_cols].corr()
|
||||
|
||||
return corr_matrix
|
||||
for candidate in [config.EMPLOYEE_ID_COLUMN]:
|
||||
if candidate in numeric_cols:
|
||||
numeric_cols.remove(candidate)
|
||||
return df[numeric_cols].corr()
|
||||
|
||||
|
||||
def get_correlation_for_heatmap():
|
||||
corr_matrix = calculate_correlation()
|
||||
|
||||
key_features = [
|
||||
'Age',
|
||||
'Service time',
|
||||
'Distance from Residence to Work',
|
||||
'Work load Average/day ',
|
||||
'Body mass index',
|
||||
'Absenteeism time in hours'
|
||||
'月均加班时长',
|
||||
'通勤时长分钟',
|
||||
'近90天缺勤次数',
|
||||
'BMI',
|
||||
'近30天睡眠时长均值',
|
||||
'缺勤时长(小时)',
|
||||
]
|
||||
|
||||
key_features = [f for f in key_features if f in corr_matrix.columns]
|
||||
|
||||
sub_matrix = corr_matrix.loc[key_features, key_features]
|
||||
|
||||
result = {
|
||||
return {
|
||||
'features': [config.FEATURE_NAME_CN.get(f, f) for f in key_features],
|
||||
'matrix': sub_matrix.values.round(2).tolist()
|
||||
'matrix': sub_matrix.values.round(2).tolist(),
|
||||
}
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def calculate_feature_importance(model, feature_names):
|
||||
if hasattr(model, 'feature_importances_'):
|
||||
importance = model.feature_importances_
|
||||
else:
|
||||
raise ValueError("Model does not have feature_importances_ attribute")
|
||||
|
||||
importance_dict = dict(zip(feature_names, importance))
|
||||
|
||||
sorted_importance = sorted(importance_dict.items(), key=lambda x: x[1], reverse=True)
|
||||
|
||||
return sorted_importance
|
||||
|
||||
|
||||
def get_feature_importance_from_model(model_path, feature_names):
|
||||
import joblib
|
||||
|
||||
model = joblib.load(model_path)
|
||||
return calculate_feature_importance(model, feature_names)
|
||||
|
||||
|
||||
def group_comparison(dimension):
|
||||
df = get_clean_data()
|
||||
|
||||
dimension_map = {
|
||||
'drinker': ('Social drinker', {0: '不饮酒', 1: '饮酒'}),
|
||||
'smoker': ('Social smoker', {0: '不吸烟', 1: '吸烟'}),
|
||||
'education': ('Education', {1: '高中', 2: '本科', 3: '研究生', 4: '博士'}),
|
||||
'children': ('Son', {0: '无子女'}, lambda x: x > 0, '有子女'),
|
||||
'pet': ('Pet', {0: '无宠物'}, lambda x: x > 0, '有宠物')
|
||||
'industry': ('所属行业', None, '所属行业'),
|
||||
'shift_type': ('班次类型', None, '班次类型'),
|
||||
'job_family': ('岗位序列', None, '岗位序列'),
|
||||
'marital_status': ('婚姻状态', None, '婚姻状态'),
|
||||
'chronic_disease': ('是否慢性病史', {0: '无慢性病史', 1: '有慢性病史'}, '慢性病史'),
|
||||
}
|
||||
|
||||
if dimension not in dimension_map:
|
||||
raise ValueError(f"Invalid dimension: {dimension}")
|
||||
|
||||
col, value_map = dimension_map[dimension][0], dimension_map[dimension][1]
|
||||
|
||||
if dimension in ['children', 'pet']:
|
||||
threshold_fn = dimension_map[dimension][2]
|
||||
other_label = dimension_map[dimension][3]
|
||||
|
||||
groups = []
|
||||
for val in [0]:
|
||||
group_df = df[df[col] == val]
|
||||
if len(group_df) > 0:
|
||||
groups.append({
|
||||
'name': value_map.get(val, str(val)),
|
||||
'value': val,
|
||||
'avg_hours': round(group_df['Absenteeism time in hours'].mean(), 2),
|
||||
'count': len(group_df),
|
||||
'percentage': round(len(group_df) / len(df) * 100, 1)
|
||||
})
|
||||
|
||||
group_df = df[df[col].apply(threshold_fn)]
|
||||
if len(group_df) > 0:
|
||||
groups.append({
|
||||
'name': other_label,
|
||||
'value': 1,
|
||||
'avg_hours': round(group_df['Absenteeism time in hours'].mean(), 2),
|
||||
'count': len(group_df),
|
||||
'percentage': round(len(group_df) / len(df) * 100, 1)
|
||||
})
|
||||
else:
|
||||
groups = []
|
||||
for val in sorted(df[col].unique()):
|
||||
group_df = df[df[col] == val]
|
||||
if len(group_df) > 0:
|
||||
groups.append({
|
||||
'name': value_map.get(val, str(val)),
|
||||
'value': int(val),
|
||||
'avg_hours': round(group_df['Absenteeism time in hours'].mean(), 2),
|
||||
'count': len(group_df),
|
||||
'percentage': round(len(group_df) / len(df) * 100, 1)
|
||||
})
|
||||
|
||||
if len(groups) >= 2:
|
||||
diff_value = abs(groups[0]['avg_hours'] - groups[1]['avg_hours'])
|
||||
base = min(groups[0]['avg_hours'], groups[1]['avg_hours'])
|
||||
diff_percentage = round(diff_value / base * 100, 1) if base > 0 else 0
|
||||
else:
|
||||
diff_value = 0
|
||||
diff_percentage = 0
|
||||
|
||||
|
||||
column, value_map, dimension_name = dimension_map[dimension]
|
||||
groups = []
|
||||
for value in sorted(df[column].unique()):
|
||||
group_df = df[df[column] == value]
|
||||
groups.append({
|
||||
'name': value_map.get(value, value) if value_map else str(value),
|
||||
'value': int(value) if isinstance(value, (int, np.integer)) else str(value),
|
||||
'avg_hours': round(group_df[config.TARGET_COLUMN].mean(), 2),
|
||||
'count': int(len(group_df)),
|
||||
'percentage': round(len(group_df) / len(df) * 100, 1),
|
||||
})
|
||||
|
||||
groups.sort(key=lambda item: item['avg_hours'], reverse=True)
|
||||
top = groups[0]['avg_hours'] if groups else 0
|
||||
bottom = groups[-1]['avg_hours'] if len(groups) > 1 else 0
|
||||
diff_value = round(top - bottom, 2)
|
||||
diff_percentage = round(diff_value / bottom * 100, 1) if bottom else 0
|
||||
|
||||
return {
|
||||
'dimension': dimension,
|
||||
'dimension_name': {
|
||||
'drinker': '饮酒习惯',
|
||||
'smoker': '吸烟习惯',
|
||||
'education': '学历',
|
||||
'children': '子女',
|
||||
'pet': '宠物'
|
||||
}.get(dimension, dimension),
|
||||
'dimension_name': dimension_name,
|
||||
'groups': groups,
|
||||
'difference': {
|
||||
'value': diff_value,
|
||||
'percentage': diff_percentage
|
||||
}
|
||||
'percentage': diff_percentage,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
print("Correlation matrix:")
|
||||
corr = get_correlation_for_heatmap()
|
||||
print(corr)
|
||||
|
||||
print("\nGroup comparison (drinker):")
|
||||
comp = group_comparison('drinker')
|
||||
print(comp)
|
||||
|
||||
336
backend/core/generate_dataset.py
Normal file
336
backend/core/generate_dataset.py
Normal file
@@ -0,0 +1,336 @@
|
||||
import os
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
import config
|
||||
|
||||
|
||||
# Per-industry tuning weights read by build_employee_pool:
#   shift_bias    - scales the sampling weights of the two-shift and
#                   three-shift rotas and lowers the weight of the
#                   standard day shift (floored at 0.1).
#   overtime_bias - added (times 18) to the mean of the sampled monthly
#                   overtime hours.
#   night_bias    - probability that an employee on the two-shift rota is
#                   flagged as holding a night-shift post.
# The keys double as the list of industries companies are drawn from.
INDUSTRIES = {
    '制造业': {'shift_bias': 0.9, 'overtime_bias': 0.8, 'night_bias': 0.8},
    '互联网': {'shift_bias': 0.2, 'overtime_bias': 1.0, 'night_bias': 0.2},
    '零售连锁': {'shift_bias': 0.7, 'overtime_bias': 0.5, 'night_bias': 0.3},
    '物流运输': {'shift_bias': 0.9, 'overtime_bias': 0.7, 'night_bias': 0.9},
    '金融服务': {'shift_bias': 0.1, 'overtime_bias': 0.7, 'night_bias': 0.1},
    '医药健康': {'shift_bias': 0.6, 'overtime_bias': 0.6, 'night_bias': 0.5},
    '建筑工程': {'shift_bias': 0.5, 'overtime_bias': 0.8, 'night_bias': 0.3},
}
|
||||
|
||||
|
||||
def season_from_month(month):
    """Map a calendar month number to a season code.

    Returns 1 for December-February, 2 for March-May, 3 for June-August
    and 4 for every other value (September-November as well as anything
    that is not one of the listed months).
    """
    season_months = (
        (1, (12, 1, 2)),
        (2, (3, 4, 5)),
        (3, (6, 7, 8)),
    )
    for season_code, months in season_months:
        if month in months:
            return season_code
    # Anything not matched above falls through to code 4.
    return 4
|
||||
|
||||
|
||||
def weighted_choice(rng, items, probs):
    """Draw one element of ``items`` using ``probs`` as relative weights.

    The weights are converted to floats and rescaled to sum to 1 before
    being passed to ``rng.choice``, so callers may supply unnormalised
    weights.
    """
    weights = np.asarray(probs, dtype=float)
    normalised = weights / weights.sum()
    return rng.choice(items, p=normalised)
|
||||
|
||||
|
||||
def build_company_pool(rng, company_count=180):
    """Generate ``company_count`` simulated company records.

    Each record is a dict holding a sequential company code (``C001``,
    ``C002``, ...), an industry drawn from the keys of ``INDUSTRIES``, a
    headcount band and a city tier, each sampled with fixed weights through
    ``weighted_choice``.
    """
    industry_names = list(INDUSTRIES.keys())
    industry_weights = [0.22, 0.14, 0.14, 0.14, 0.1, 0.12, 0.14]
    scale_bands = ['100人以下', '100-499人', '500-999人', '1000-4999人', '5000人及以上']
    scale_weights = [0.15, 0.28, 0.2, 0.24, 0.13]
    tier_names = ['一线', '新一线', '二线', '三线及以下']
    tier_weights = [0.18, 0.34, 0.3, 0.18]

    pool = []
    for number in range(1, company_count + 1):
        # The draw order (industry, scale, city tier) is kept fixed so that
        # the same generator state yields the same pool.
        sector = weighted_choice(rng, industry_names, industry_weights)
        scale = weighted_choice(rng, scale_bands, scale_weights)
        tier = weighted_choice(rng, tier_names, tier_weights)
        pool.append({
            '企业编号': f'C{number:03d}',
            '所属行业': sector,
            '企业规模': scale,
            '所在城市等级': tier,
        })
    return pool
|
||||
|
||||
|
||||
def build_employee_pool(rng, companies, employee_count=2600):
    """Generate ``employee_count`` simulated employee profiles.

    Every employee is attached to a uniformly chosen entry of ``companies``
    and inherits that company's code, industry, scale and city tier.  The
    remaining attributes (demographics, shift pattern, overtime, commute,
    health, absence history, team data) are sampled from ``rng``.

    All random draws happen in the order the statements appear, including
    the draws that only occur when a short-circuited condition reaches them,
    so reordering statements changes the data produced from a given
    generator state.

    Args:
        rng: random generator providing ``integers``, ``normal``,
            ``poisson``, ``uniform``, ``random`` and ``choice`` (presumably
            a ``numpy.random.Generator`` - confirm against the caller).
        companies: sequence of company dicts carrying the keys
            '企业编号', '所属行业', '企业规模' and '所在城市等级'.
        employee_count: number of employee records to create.

    Returns:
        A list of dicts, one per employee, keyed by the Chinese column names.
    """
    # Category vocabularies for the sampled categorical attributes.
    genders = ['男', '女']
    employment_types = ['正式员工', '劳务派遣', '外包驻场', '实习生']
    departments = ['生产', '研发', '销售', '客服', '职能', '仓储物流', '门店运营']
    job_families = ['管理', '专业技术', '销售业务', '生产操作', '行政支持', '客服坐席']
    job_levels = ['初级', '中级', '高级', '主管', '经理及以上']
    educations = ['中专及以下', '大专', '本科', '硕士', '博士']
    marital = ['未婚', '已婚', '离异/其他']
    housing = ['自有住房', '租房', '宿舍']
    shifts = ['标准白班', '两班倒', '三班倒', '弹性班']
    performance = ['A', 'B', 'C', 'D']
    stress = ['低', '中', '高']

    employees = []
    for idx in range(employee_count):
        # Pick the employer uniformly at random from the company pool.
        company = companies[rng.integers(0, len(companies))]
        industry = company['所属行业']
        # Age is clipped to 20-55; tenure tracks (age - 21) plus noise.
        age = int(np.clip(rng.normal(33, 7), 20, 55))
        tenure = round(float(np.clip(age - 21 + rng.normal(0, 2), 0.2, 32)), 1)
        family_bias = 0.6 if age >= 30 else 0.25
        # Marital-status weights differ for employees under 30.
        married = weighted_choice(rng, marital, [0.45, 0.48, 0.07] if age < 30 else [0.18, 0.72, 0.1])
        # NOTE(review): unmarried employees use a Poisson mean of 0.4, which is
        # higher than the 0.25 applied to everyone else under 30 - confirm
        # this is the intended direction.
        children = int(np.clip(rng.poisson(0.4 if married == '未婚' else family_bias), 0, 3))
        industry_profile = INDUSTRIES[industry]
        # Shift weights follow the order of `shifts`; weighted_choice
        # normalises them, so they need not sum to 1.
        shift = weighted_choice(
            rng,
            shifts,
            [
                max(0.1, 1 - industry_profile['shift_bias']),
                0.35 * industry_profile['shift_bias'],
                0.25 * industry_profile['shift_bias'],
                0.2,
            ],
        )
        # Three-shift staff are always night-shift; two-shift staff are
        # flagged with probability night_bias (the draw only happens then).
        night_flag = int(shift == '三班倒' or (shift == '两班倒' and rng.random() < industry_profile['night_bias']))
        overtime = float(np.clip(rng.normal(22 + 18 * industry_profile['overtime_bias'], 10), 0, 90))
        commute_minutes = float(np.clip(rng.normal(42, 18), 8, 130))
        # Distance is derived from the commute time with a random factor.
        commute_km = float(np.clip(commute_minutes * rng.uniform(0.35, 0.75), 2, 65))
        performance_level = weighted_choice(rng, performance, [0.18, 0.46, 0.26, 0.1])
        # Chronic-disease probability grows with age, with a 5% floor.
        chronic_flag = int(rng.random() < max(0.05, (age - 26) * 0.01))
        # A chronic condition always marks the check-up as abnormal.
        check_abnormal = int(chronic_flag == 1 or rng.random() < 0.14)
        # Night-shift posts lower the mean sleep duration by 0.35.
        sleep_hours = round(float(np.clip(rng.normal(6.9 - 0.35 * night_flag, 0.8), 4.5, 9.0)), 1)
        exercise = int(np.clip(rng.poisson(2.2), 0, 7))
        # NOTE(review): the 0.22 / 0.08 smoking rate is selected by a second
        # random draw, not by the gender sampled further down - confirm
        # whether the rate was meant to depend on '性别'.
        smoking = int(rng.random() < (0.22 if rng.random() < 0.55 else 0.08))
        drinking = int(rng.random() < 0.27)
        # The weight of the high-stress level rises with overtime (capped at +0.15).
        stress_level = weighted_choice(
            rng,
            stress,
            [0.22, 0.52, 0.26 + min(0.15, overtime / 120)],
        )
        bmi = round(float(np.clip(rng.normal(24.2, 3.2), 17.5, 36.5)), 1)
        # Absence history increases with chronic illness and number of children.
        history_count = int(np.clip(rng.poisson(1.2 + chronic_flag * 0.6 + children * 0.15), 0, 8))
        history_hours = float(np.clip(rng.normal(18 + chronic_flag * 10 + history_count * 3, 10), 0, 120))
        discipline = int(np.clip(rng.poisson(0.2), 0, 4))
        team_size = int(np.clip(rng.normal(11, 5), 3, 40))
        manager_span = int(np.clip(team_size + rng.normal(3, 2), 4, 60))
        local_hukou = int(rng.random() < 0.58)
        # Commutes over 65 minutes always count as cross-city; otherwise only
        # non-local employees may be flagged (probability 0.35).
        cross_city = int(commute_minutes > 65 or (local_hukou == 0 and rng.random() < 0.35))
        # Internet and finance staff get a 55% sedentary rate, others 30%.
        sedentary = int(weighted_choice(rng, [0, 1], [0.45, 0.55]) if company['所属行业'] in ['互联网', '金融服务'] else rng.random() < 0.3)

        # The dict values below are evaluated top to bottom, so the random
        # draws embedded in it also run in that order.
        employees.append({
            '企业编号': company['企业编号'],
            '所属行业': industry,
            '企业规模': company['企业规模'],
            '所在城市等级': company['所在城市等级'],
            '用工类型': weighted_choice(rng, employment_types, [0.74, 0.12, 0.1, 0.04]),
            '部门条线': weighted_choice(rng, departments, [0.18, 0.16, 0.14, 0.11, 0.12, 0.14, 0.15]),
            '岗位序列': weighted_choice(rng, job_families, [0.08, 0.24, 0.16, 0.2, 0.12, 0.2]),
            '岗位级别': weighted_choice(rng, job_levels, [0.34, 0.32, 0.18, 0.11, 0.05]),
            '员工编号': f'E{idx + 1:05d}',
            '性别': weighted_choice(rng, genders, [0.56, 0.44]),
            '年龄': age,
            '司龄年数': tenure,
            '最高学历': weighted_choice(rng, educations, [0.14, 0.28, 0.4, 0.15, 0.03]),
            '婚姻状态': married,
            '是否本地户籍': local_hukou,
            '子女数量': children,
            '是否独生子女家庭负担': int(children >= 2 or (married == '已婚' and rng.random() < 0.18)),
            '居住类型': weighted_choice(rng, housing, [0.38, 0.48, 0.14]),
            '班次类型': shift,
            '是否夜班岗位': night_flag,
            '月均加班时长': round(overtime, 1),
            '近30天出勤天数': int(np.clip(rng.normal(21.5, 2.2), 14, 27)),
            '近90天缺勤次数': history_count,
            '近180天请假总时长': round(history_hours, 1),
            '通勤时长分钟': round(commute_minutes, 1),
            '通勤距离公里': round(commute_km, 1),
            '是否跨城通勤': cross_city,
            '绩效等级': performance_level,
            '近12月违纪次数': discipline,
            '团队人数': team_size,
            '直属上级管理跨度': manager_span,
            'BMI': bmi,
            '是否慢性病史': chronic_flag,
            '年度体检异常标记': check_abnormal,
            '近30天睡眠时长均值': sleep_hours,
            '每周运动频次': exercise,
            '是否吸烟': smoking,
            '是否饮酒': drinking,
            '心理压力等级': stress_level,
            '是否长期久坐岗位': sedentary,
        })
    return employees
|
||||
|
||||
|
||||
def sample_event(rng, employee):
    """Draw one synthetic absence event for ``employee``.

    Args:
        rng: Random generator exposing ``integers``, ``random``, ``normal``
            and ``uniform`` (it is also handed to ``weighted_choice``).
        employee: Mapping of employee-level attributes; it is copied, never
            mutated.

    Returns:
        A copy of ``employee`` extended with the event-level fields (month,
        weekday, leave type, flags, ...) and the absence duration in hours,
        clipped to ``[0.5, 24.0]`` and rounded to one decimal.
    """
    # NOTE: the order of the random draws below is part of the behaviour --
    # reordering them changes every dataset generated from a fixed seed.
    month = int(rng.integers(1, 13))
    weekday = int(rng.integers(1, 8))
    holiday_probability = 0.3 if month in [1, 2, 4, 5, 9, 10] else 0.16
    near_holiday = int(rng.random() < holiday_probability)

    leave_type = weighted_choice(
        rng,
        ['病假', '事假', '年假', '调休', '婚假', '丧假', '产检育儿假', '工伤假', '其他'],
        [0.3, 0.22, 0.12, 0.14, 0.03, 0.02, 0.06, 0.02, 0.09],
    )

    # Employees with children occasionally get the child-care reason; the
    # random draw only happens when the employee has at least one child.
    reason_category = '子女照护'
    if not (employee['子女数量'] > 0 and rng.random() < 0.14):
        reason_category = weighted_choice(
            rng,
            ['身体不适', '家庭事务', '交通受阻', '突发事件', '职业疲劳', '就医复查'],
            [0.28, 0.19, 0.09, 0.11, 0.2, 0.13],
        )

    medical_certificate = int(
        leave_type in ['病假', '工伤假'] or reason_category in ['身体不适', '就医复查']
    )
    urgent_probability = 0.45 if leave_type in ['病假', '事假', '工伤假'] else 0.18
    urgent_leave = int(rng.random() < urgent_probability)
    continuous_probability = 0.2 if leave_type in ['病假', '产检育儿假', '工伤假'] else 0.08
    continuous_absence = int(rng.random() < continuous_probability)
    previous_overtime = int(rng.random() < min(0.85, employee['月均加班时长'] / 65))
    season = season_from_month(month)
    channel = weighted_choice(rng, ['系统申请', '主管代提', '临时电话报备'], [0.68, 0.18, 0.14])

    # Employee-level score: additive risk factors first, then mitigating
    # factors. Terms are accumulated one by one, in this exact order, so the
    # floating-point result matches a plain chain of += / -= statements.
    stress_weight = {'高': 0.5, '中': 0.2}.get(employee['心理压力等级'], -0.1)
    performance_discount = {'A': 0.35, 'B': 0.15}.get(employee['绩效等级'], 0)
    risk_terms = [
        min(employee['月均加班时长'] / 28, 1.8),
        min(employee['通勤时长分钟'] / 65, 1.2),
        employee['是否夜班岗位'] * 0.9,
        employee['是否慢性病史'] * 1.25,
        employee['年度体检异常标记'] * 0.6,
        0.35 * employee['子女数量'],
        stress_weight,
        0.4 if employee['是否跨城通勤'] else 0,
        0.35 if previous_overtime else 0,
        0.35 if near_holiday else 0,
        0.3 if continuous_absence else 0,
        0.3 if employee['近90天缺勤次数'] >= 3 else 0,
    ]
    relief_terms = [
        performance_discount,
        min(employee['司龄年数'] / 40, 0.5),
        min(employee['每周运动频次'] * 0.08, 0.3),
        0.2 if employee['近30天睡眠时长均值'] >= 7.5 else 0,
    ]
    score = 0.95
    for term in risk_terms:
        score += term
    for term in relief_terms:
        score -= term

    # Event-level lookup tables (hours added per category).
    hours_by_leave_type = {
        '病假': 2.0,
        '事假': 0.8,
        '年假': 0.1,
        '调休': 0.1,
        '婚假': 3.0,
        '丧假': 2.8,
        '产检育儿假': 2.4,
        '工伤假': 3.8,
        '其他': 0.5,
    }
    hours_by_reason = {
        '身体不适': 1.0,
        '家庭事务': 0.5,
        '子女照护': 0.8,
        '交通受阻': 0.2,
        '突发事件': 0.6,
        '职业疲劳': 0.7,
        '就医复查': 1.2,
    }
    hours_by_industry = {
        '制造业': 0.35,
        '互联网': 0.2,
        '零售连锁': 0.25,
        '物流运输': 0.4,
        '金融服务': 0.1,
        '医药健康': 0.2,
        '建筑工程': 0.35,
    }
    hours_by_season = {1: 0.35, 2: 0.0, 3: 0.15, 4: 0.05}
    hours_by_weekday = {1: 0.05, 2: 0.0, 3: 0.0, 4: 0.05, 5: 0.15, 6: 0.25, 7: 0.3}

    event_terms = (
        hours_by_leave_type[leave_type],
        hours_by_reason[reason_category],
        hours_by_industry[employee['所属行业']],
        hours_by_season[season],
        hours_by_weekday[weekday],
        0.55 if medical_certificate else 0,
        0.4 if urgent_leave else -0.05,
    )
    duration = score
    for term in event_terms:
        duration += term
    duration += rng.normal(0, 0.9)

    # Occasional long tails for statutory leave and chronic-illness sick leave.
    if leave_type in ['婚假', '丧假', '工伤假'] and rng.random() < 0.5:
        duration += rng.uniform(1.5, 5)
    if leave_type == '病假' and employee['是否慢性病史'] == 1 and rng.random() < 0.35:
        duration += rng.uniform(1, 4)
    # Annual leave and compensatory leave are scaled down.
    if leave_type in ['年假', '调休']:
        duration *= rng.uniform(0.7, 0.95)

    duration = round(float(np.clip(duration, 0.5, 24.0)), 1)

    event_fields = {
        '缺勤月份': month,
        '星期几': weekday,
        '是否节假日前后': near_holiday,
        '季节': season,
        '请假申请渠道': channel,
        '请假类型': leave_type,
        '请假原因大类': reason_category,
        '是否提供医院证明': medical_certificate,
        '是否临时请假': urgent_leave,
        '是否连续缺勤': continuous_absence,
        '前一工作日是否加班': previous_overtime,
        '缺勤时长(小时)': duration,
    }
    event = employee.copy()
    event.update(event_fields)
    return event
|
||||
|
||||
|
||||
def _require_positive_signal(df, flag_column, target_column, message):
    """Raise ``ValueError(message)`` unless rows with ``flag_column == 1`` have a strictly higher mean target."""
    flagged_mean = df.loc[df[flag_column] == 1, target_column].mean()
    unflagged_mean = df.loc[df[flag_column] == 0, target_column].mean()
    # ``not a > b`` (instead of ``a <= b``) also rejects NaN means, which occur
    # when one of the two groups is empty; NaN comparisons are always False.
    if not flagged_mean > unflagged_mean:
        raise ValueError(message)


def validate_dataset(df):
    """Sanity-check a generated (or reloaded) absence dataset.

    Args:
        df: DataFrame of absence events.

    Raises:
        ValueError: If a required column is missing, the dataset has fewer
            than 10000 rows or fewer than 2000 distinct employees, the share
            of events longer than 8 hours is outside ``[0.15, 0.4]``, or the
            medical-certificate / night-shift groups do not show a higher
            mean absence duration than their complements.
    """
    target_column = '缺勤时长(小时)'
    # Every column read further down must be listed here so that a missing
    # column is reported as ValueError rather than surfacing as a KeyError.
    required_columns = [
        '员工编号',
        '所属行业',
        '岗位序列',
        '月均加班时长',
        '通勤时长分钟',
        '是否慢性病史',
        '请假类型',
        target_column,
        '是否提供医院证明',
        '是否夜班岗位',
    ]
    for column in required_columns:
        if column not in df.columns:
            raise ValueError(f'Missing required column: {column}')

    if len(df) < 10000:
        raise ValueError('Synthetic dataset is smaller than expected')
    if df['员工编号'].nunique() < 2000:
        raise ValueError('Employee coverage is too small')

    high_risk_ratio = (df[target_column] > 8).mean()
    if not 0.15 <= high_risk_ratio <= 0.4:
        raise ValueError(f'High risk ratio out of range: {high_risk_ratio:.3f}')

    _require_positive_signal(
        df, '是否提供医院证明', target_column, 'Medical certificate signal is not effective'
    )
    _require_positive_signal(
        df, '是否夜班岗位', target_column, 'Night shift signal is not effective'
    )
|
||||
|
||||
|
||||
def generate_dataset(output_path=None, sample_count=12000, random_state=None):
    """Generate the synthetic absence dataset and optionally write it to CSV.

    Args:
        output_path: Destination CSV path. When falsy, nothing is written.
        sample_count: Number of absence events to sample.
        random_state: Seed for the generator; ``config.RANDOM_STATE`` is used
            when this is ``None``.

    Returns:
        The validated DataFrame of sampled events.

    Raises:
        ValueError: Propagated from ``validate_dataset`` when the sampled
            data fails the sanity checks.
    """
    seed = config.RANDOM_STATE if random_state is None else random_state
    rng = np.random.default_rng(seed)
    companies = build_company_pool(rng)
    employees = build_employee_pool(rng, companies)

    # Employees are drawn with replacement, so one employee can contribute
    # several events.
    employee_idx = rng.integers(0, len(employees), size=sample_count)
    events = [sample_event(rng, employees[int(idx)]) for idx in employee_idx]

    df = pd.DataFrame(events)
    validate_dataset(df)

    if output_path:
        # A bare filename has an empty dirname, and os.makedirs('') raises
        # FileNotFoundError -- only create the directory when there is one.
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        df.to_csv(output_path, index=False, encoding='utf-8-sig')
    return df
|
||||
|
||||
|
||||
def ensure_dataset():
    """Make sure a valid dataset exists at ``config.RAW_DATA_PATH``.

    The dataset is (re)generated when the file is missing, or when reading or
    validating the existing file fails for any reason.
    """
    needs_rebuild = not os.path.exists(config.RAW_DATA_PATH)
    if not needs_rebuild:
        try:
            validate_dataset(pd.read_csv(config.RAW_DATA_PATH))
        except Exception:
            # Best effort: any unreadable or invalid file is simply replaced.
            needs_rebuild = True
    if needs_rebuild:
        generate_dataset(config.RAW_DATA_PATH)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: build the dataset, write it to the configured
    # raw-data path, then print a short preview.
    generated = generate_dataset(config.RAW_DATA_PATH)
    print(f'Generated dataset: {config.RAW_DATA_PATH}')
    preview = generated.head()
    print(preview)
|
||||
326
backend/core/model_features.py
Normal file
326
backend/core/model_features.py
Normal file
@@ -0,0 +1,326 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
|
||||
import config
|
||||
|
||||
|
||||
# Column names shared with the rest of the project, taken from config.
TARGET_COLUMN = config.TARGET_COLUMN
ID_COLUMN = config.EMPLOYEE_ID_COLUMN
COMPANY_COLUMN = config.COMPANY_ID_COLUMN
# Identifier columns that prepare_modeling_dataframe removes before modeling.
LEAKY_COLUMNS = [ID_COLUMN, COMPANY_COLUMN]
# Default column list for fit_label_encoders. The last four entries are the
# bucket columns created by engineer_features.
ORDINAL_COLUMNS = [
    '企业规模',
    '所在城市等级',
    '岗位级别',
    '最高学历',
    '绩效等级',
    '心理压力等级',
    '工龄分层',
    '年龄分层',
    '通勤分层',
    '加班分层',
]
# Numeric columns intended for percentile clipping.
# NOTE(review): presumably passed to fit_outlier_bounds by the trainer --
# confirm at the call site.
NUMERICAL_OUTLIER_COLUMNS = [
    '年龄',
    '司龄年数',
    '月均加班时长',
    '近30天出勤天数',
    '近90天缺勤次数',
    '近180天请假总时长',
    '通勤时长分钟',
    '通勤距离公里',
    '团队人数',
    '直属上级管理跨度',
    'BMI',
    '近30天睡眠时长均值',
    '每周运动频次',
]
# Fallback values, keyed by request field name, that
# build_prediction_dataframe uses when a field is absent from the input.
DEFAULT_PREDICTION_INPUT = {
    'industry': '制造业',
    'company_size': '1000-4999人',
    'city_tier': '新一线',
    'age': 31,
    'tenure_years': 4.5,
    'education_level': '本科',
    'marital_status': '已婚',
    'job_family': '专业技术',
    'job_level': '中级',
    'employment_type': '正式员工',
    'shift_type': '标准白班',
    'is_night_shift': 0,
    'monthly_overtime_hours': 26,
    'attendance_days_30d': 22,
    'absence_count_90d': 1,
    'leave_hours_180d': 18,
    'commute_minutes': 42,
    'commute_km': 18,
    'cross_city_commute': 0,
    'performance_level': 'B',
    'disciplinary_count_12m': 0,
    'team_size': 10,
    'manager_span': 14,
    'bmi': 24.5,
    'chronic_disease_flag': 0,
    'annual_check_abnormal_flag': 0,
    'sleep_hours': 7.1,
    'exercise_frequency': 2,
    'smoking_flag': 0,
    'drinking_flag': 0,
    'stress_level': '中',
    'sedentary_job_flag': 1,
    'local_hukou_flag': 1,
    'children_count': 1,
    'single_child_burden_flag': 0,
    'absence_month': 5,
    'weekday': 2,
    'near_holiday_flag': 0,
    'leave_channel': '系统申请',
    'leave_type': '病假',
    'leave_reason_category': '身体不适',
    'medical_certificate_flag': 1,
    'urgent_leave_flag': 1,
    'continuous_absence_flag': 0,
    'previous_day_overtime_flag': 1,
}
|
||||
|
||||
|
||||
def make_target_bins(y):
    """Bucket absence durations into string risk labels.

    Values in ``[0, 4]`` map to ``'low'``, ``(4, 8]`` to ``'medium'``,
    ``(8, 12]`` to ``'high'`` and anything above 12 to ``'extreme'``.

    Args:
        y: Sequence of target values (hours).

    Returns:
        A pandas Series of label strings, one per input value.
    """
    edges = [0, 4, 8, 12, np.inf]
    names = ['low', 'medium', 'high', 'extreme']
    binned = pd.cut(pd.Series(y), bins=edges, labels=names, include_lowest=True)
    return binned.astype(str)
|
||||
|
||||
|
||||
def normalize_columns(df):
    """Return a copy of ``df`` whose string column labels are stripped of surrounding whitespace.

    Non-string labels (e.g. integer column positions) are kept unchanged
    instead of failing on the missing ``strip`` method.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        A new DataFrame with cleaned column labels.
    """
    df = df.copy()
    df.columns = [col.strip() if isinstance(col, str) else col for col in df.columns]
    return df
|
||||
|
||||
|
||||
def prepare_modeling_dataframe(df):
    """Normalize column labels and remove the identifier columns.

    Args:
        df: Raw or cleaned event DataFrame.

    Returns:
        A new DataFrame without any of the ``LEAKY_COLUMNS`` that were present.
    """
    normalized = normalize_columns(df)
    # errors='ignore' skips identifier columns that are not in the frame.
    return normalized.drop(columns=LEAKY_COLUMNS, errors='ignore')
|
||||
|
||||
|
||||
def fit_outlier_bounds(df, columns, lower_pct=1, upper_pct=99):
    """Compute per-column percentile bounds for later clipping.

    Args:
        df: DataFrame to measure.
        columns: Candidate column names; names that are absent from ``df`` or
            not numeric are skipped.
        lower_pct: Lower percentile (0-100).
        upper_pct: Upper percentile (0-100).

    Returns:
        Dict mapping column name to a ``(lower, upper)`` tuple of floats.
    """
    low_q = lower_pct / 100
    high_q = upper_pct / 100
    eligible = (
        name
        for name in columns
        if name in df.columns and pd.api.types.is_numeric_dtype(df[name])
    )
    return {
        name: (float(df[name].quantile(low_q)), float(df[name].quantile(high_q)))
        for name in eligible
    }
|
||||
|
||||
|
||||
def apply_outlier_bounds(df, bounds):
    """Clip columns of ``df`` to previously fitted bounds.

    Args:
        df: DataFrame to clip; it is not modified.
        bounds: Dict of column name to ``(lower, upper)``. Entries for columns
            that are not in ``df`` are ignored.

    Returns:
        A clipped copy of ``df``.
    """
    clipped = df.copy()
    for name, limits in bounds.items():
        low, high = limits
        if name not in clipped.columns:
            continue
        clipped[name] = clipped[name].clip(low, high)
    return clipped
|
||||
|
||||
|
||||
def _bucketize(series, inner_edges):
    """Bin ``series`` into categorical labels '1'..'N' with open-ended outer bins."""
    # Infinite outer edges keep boundary and out-of-range values (tenure 0,
    # age 18, commute 0, or anything above the old top edge) inside the first
    # or last bucket instead of turning them into NaN.
    edges = [-np.inf, *inner_edges, np.inf]
    labels = [str(position) for position in range(1, len(edges))]
    return pd.cut(series, bins=edges, labels=labels)


def engineer_features(df):
    """Add the derived index, flag and bucket columns used for modeling.

    Args:
        df: Event DataFrame containing the raw employee and event columns
            read below; it is not modified.

    Returns:
        A copy of ``df`` with nine derived numeric columns and four bucket
        columns (``工龄分层``, ``年龄分层``, ``通勤分层``, ``加班分层``) labelled
        ``'1'`` to ``'4'``.
    """
    df = df.copy()

    # Composite pressure / burden / health scores: weighted sums of raw columns.
    df['加班通勤压力指数'] = (
        df['月均加班时长'] * 0.45
        + df['通勤时长分钟'] * 0.35
        + df['是否夜班岗位'] * 12
        + df['前一工作日是否加班'] * 6
    ) / 10
    df['家庭负担指数'] = (
        df['子女数量'] * 1.2
        + df['是否独生子女家庭负担'] * 1.5
        + (df['婚姻状态'] == '已婚').astype(int) * 0.6
    )
    df['健康风险指数'] = (
        df['是否慢性病史'] * 2
        + df['年度体检异常标记'] * 1.2
        + (df['BMI'] >= 28).astype(int) * 1.1
        + df['是否吸烟'] * 0.8
        + df['是否饮酒'] * 0.4
        + (df['近30天睡眠时长均值'] < 6.5).astype(int) * 1.2
    )
    df['岗位稳定性指数'] = (
        df['司龄年数'] * 0.3
        + (df['绩效等级'] == 'A').astype(int) * 1.2
        + (df['绩效等级'] == 'B').astype(int) * 0.8
        - df['近12月违纪次数'] * 0.7
    )

    # Binary flags.
    df['节假日风险标记'] = (
        (df['是否节假日前后'] == 1) | (df['请假类型'].isin(['事假', '年假', '调休']))
    ).astype(int)
    df['排班压力标记'] = (
        (df['班次类型'].isin(['两班倒', '三班倒'])) | (df['是否夜班岗位'] == 1)
    ).astype(int)

    df['缺勤历史强度'] = df['近90天缺勤次数'] * 1.5 + df['近180天请假总时长'] / 12
    df['生活规律指数'] = (
        df['近30天睡眠时长均值'] * 0.6
        + df['每周运动频次'] * 0.7
        - df['是否吸烟'] * 1.1
        - df['是否饮酒'] * 0.5
    )
    df['管理负荷指数'] = df['团队人数'] * 0.4 + df['直属上级管理跨度'] * 0.25

    # Bucket columns. The inner edges are the original cut points; values that
    # fell inside the old ranges keep exactly the same label.
    df['工龄分层'] = _bucketize(df['司龄年数'], [2, 5, 10])
    df['年龄分层'] = _bucketize(df['年龄'], [25, 32, 40])
    df['通勤分层'] = _bucketize(df['通勤时长分钟'], [25, 45, 70])
    df['加班分层'] = _bucketize(df['月均加班时长'], [10, 25, 45])
    return df
|
||||
|
||||
|
||||
def fit_label_encoders(df, ordinal_columns=None):
    """Fit one ``LabelEncoder`` per categorical column and encode a copy of ``df``.

    Args:
        df: DataFrame to encode; it is not modified.
        ordinal_columns: Extra columns to encode even if they are not of
            object/category dtype. ``None`` selects ``ORDINAL_COLUMNS``; an
            explicit empty list means "no extra columns".

    Returns:
        Tuple ``(encoded_df, encoders)`` where ``encoders`` maps each encoded
        column name to its fitted ``LabelEncoder``.
    """
    # Compare against None: the previous ``or`` fallback silently replaced an
    # explicitly passed empty list with the default column list.
    if ordinal_columns is None:
        ordinal_columns = ORDINAL_COLUMNS
    df = df.copy()
    encoders = {}
    object_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
    requested = [col for col in ordinal_columns if col in df.columns]
    # Sorted so the encoders are always fitted in the same column order.
    encode_columns = sorted(set(object_columns + requested))
    for col in encode_columns:
        encoder = LabelEncoder()
        # Values are stringified first, so missing values become their own
        # string class rather than raising.
        df[col] = encoder.fit_transform(df[col].astype(str))
        encoders[col] = encoder
    return df, encoders
|
||||
|
||||
|
||||
def apply_label_encoders(df, encoders):
    """Encode columns of ``df`` with already fitted encoders.

    Args:
        df: DataFrame to encode; it is not modified.
        encoders: Dict of column name to a fitted encoder exposing
            ``classes_``. Entries for columns missing from ``df`` are skipped.

    Returns:
        An encoded copy of ``df``. Values unseen at fit time are mapped to 0.
    """
    encoded = df.copy()
    for name, encoder in encoders.items():
        if name in encoded.columns:
            code_of = {label: code for code, label in enumerate(encoder.classes_)}

            def to_code(value, table=code_of):
                # Unknown categories fall back to code 0.
                return table.get(value, 0)

            encoded[name] = encoded[name].astype(str).map(to_code)
    return encoded
|
||||
|
||||
|
||||
def extract_xy(df):
    """Split ``df`` into a feature frame and the target array.

    Args:
        df: DataFrame that may or may not contain ``TARGET_COLUMN``.

    Returns:
        Tuple ``(X_df, y)``. ``y`` is the target column's values, or ``None``
        when the column is absent (``X_df`` is then a copy of ``df``).
    """
    if TARGET_COLUMN not in df.columns:
        return df.copy(), None
    target_values = df[TARGET_COLUMN].values
    features = df.drop(columns=[TARGET_COLUMN])
    return features, target_values
|
||||
|
||||
|
||||
def build_prediction_dataframe(data):
    """Build a one-row feature DataFrame from a prediction request.

    Args:
        data: Mapping of request field names to values. Missing fields fall
            back to ``DEFAULT_PREDICTION_INPUT`` (or to a literal default for
            ``department_line``, ``gender`` and ``housing_type``).

    Returns:
        A single-row DataFrame using the dataset's column names, with
        placeholder company/employee identifiers and the season derived from
        the absence month.
    """
    defaults = DEFAULT_PREDICTION_INPUT

    def pick(field, *literal):
        # A literal fallback is used for fields that have no table default.
        fallback = literal[0] if literal else defaults[field]
        return data.get(field, fallback)

    feature_row = {
        '企业编号': 'PREDICT_COMPANY',
        '所属行业': pick('industry'),
        '企业规模': pick('company_size'),
        '所在城市等级': pick('city_tier'),
        '用工类型': pick('employment_type'),
        '部门条线': pick('department_line', '研发'),
        '岗位序列': pick('job_family'),
        '岗位级别': pick('job_level'),
        '员工编号': 'PREDICT_EMPLOYEE',
        '性别': pick('gender', '男'),
        '年龄': pick('age'),
        '司龄年数': pick('tenure_years'),
        '最高学历': pick('education_level'),
        '婚姻状态': pick('marital_status'),
        '是否本地户籍': pick('local_hukou_flag'),
        '子女数量': pick('children_count'),
        '是否独生子女家庭负担': pick('single_child_burden_flag'),
        '居住类型': pick('housing_type', '租房'),
        '班次类型': pick('shift_type'),
        '是否夜班岗位': pick('is_night_shift'),
        '月均加班时长': pick('monthly_overtime_hours'),
        '近30天出勤天数': pick('attendance_days_30d'),
        '近90天缺勤次数': pick('absence_count_90d'),
        '近180天请假总时长': pick('leave_hours_180d'),
        '通勤时长分钟': pick('commute_minutes'),
        '通勤距离公里': pick('commute_km'),
        '是否跨城通勤': pick('cross_city_commute'),
        '绩效等级': pick('performance_level'),
        '近12月违纪次数': pick('disciplinary_count_12m'),
        '团队人数': pick('team_size'),
        '直属上级管理跨度': pick('manager_span'),
        'BMI': pick('bmi'),
        '是否慢性病史': pick('chronic_disease_flag'),
        '年度体检异常标记': pick('annual_check_abnormal_flag'),
        '近30天睡眠时长均值': pick('sleep_hours'),
        '每周运动频次': pick('exercise_frequency'),
        '是否吸烟': pick('smoking_flag'),
        '是否饮酒': pick('drinking_flag'),
        '心理压力等级': pick('stress_level'),
        '是否长期久坐岗位': pick('sedentary_job_flag'),
        '缺勤月份': pick('absence_month'),
        '星期几': pick('weekday'),
        '是否节假日前后': pick('near_holiday_flag'),
        # The season is always derived from the month, never read from the request.
        '季节': _season_from_month(pick('absence_month')),
        '请假申请渠道': pick('leave_channel'),
        '请假类型': pick('leave_type'),
        '请假原因大类': pick('leave_reason_category'),
        '是否提供医院证明': pick('medical_certificate_flag'),
        '是否临时请假': pick('urgent_leave_flag'),
        '是否连续缺勤': pick('continuous_absence_flag'),
        '前一工作日是否加班': pick('previous_day_overtime_flag'),
    }
    return pd.DataFrame([feature_row])
|
||||
|
||||
|
||||
def _season_from_month(month):
    """Map a month number to a season code.

    Dec-Feb map to 1, Mar-May to 2, Jun-Aug to 3; every other value
    (including Sep-Nov) maps to 4.

    Args:
        month: Month number, or anything ``int()`` accepts.

    Returns:
        The integer season code.
    """
    season_by_month = {
        12: 1, 1: 1, 2: 1,
        3: 2, 4: 2, 5: 2,
        6: 3, 7: 3, 8: 3,
    }
    return season_by_month.get(int(month), 4)
|
||||
|
||||
|
||||
def align_feature_frame(df, feature_names):
    """Return ``df`` restricted and ordered to ``feature_names``.

    Args:
        df: Feature DataFrame; it is not modified.
        feature_names: Column names expected by the model, in order.

    Returns:
        A DataFrame with exactly ``feature_names`` as columns; features that
        are missing from ``df`` are added and filled with 0.
    """
    frame = df.copy()
    absent = [name for name in feature_names if name not in frame.columns]
    for name in absent:
        frame[name] = 0
    return frame[feature_names]
|
||||
|
||||
|
||||
def to_float_array(df):
    """Convert ``df`` to a float NumPy array (a new array, via ``astype``)."""
    raw_matrix = df.values
    return raw_matrix.astype(float)
|
||||
@@ -1,10 +1,11 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
import joblib
|
||||
import os
|
||||
|
||||
import joblib
|
||||
import pandas as pd
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
|
||||
import config
|
||||
from core.generate_dataset import ensure_dataset
|
||||
|
||||
|
||||
class DataPreprocessor:
|
||||
@@ -12,67 +13,57 @@ class DataPreprocessor:
|
||||
self.scaler = StandardScaler()
|
||||
self.is_fitted = False
|
||||
self.feature_names = None
|
||||
|
||||
|
||||
def load_raw_data(self):
|
||||
ensure_dataset()
|
||||
df = pd.read_csv(config.RAW_DATA_PATH, sep=config.CSV_SEPARATOR)
|
||||
df.columns = df.columns.str.strip()
|
||||
return df
|
||||
|
||||
|
||||
def clean_data(self, df):
|
||||
df = df.copy()
|
||||
|
||||
df = df.drop_duplicates()
|
||||
|
||||
|
||||
for col in df.columns:
|
||||
if df[col].isnull().sum() > 0:
|
||||
if df[col].dtype in ['int64', 'float64']:
|
||||
df[col].fillna(df[col].median(), inplace=True)
|
||||
else:
|
||||
df[col].fillna(df[col].mode()[0], inplace=True)
|
||||
|
||||
if df[col].isnull().sum() == 0:
|
||||
continue
|
||||
if pd.api.types.is_numeric_dtype(df[col]):
|
||||
df[col] = df[col].fillna(df[col].median())
|
||||
else:
|
||||
df[col] = df[col].fillna(df[col].mode()[0])
|
||||
|
||||
return df
|
||||
|
||||
|
||||
def fit_transform(self, df):
|
||||
df = self.clean_data(df)
|
||||
|
||||
if 'Absenteeism time in hours' in df.columns:
|
||||
y = df['Absenteeism time in hours'].values
|
||||
feature_df = df.drop(columns=['Absenteeism time in hours'])
|
||||
if config.TARGET_COLUMN in df.columns:
|
||||
y = df[config.TARGET_COLUMN].values
|
||||
feature_df = df.drop(columns=[config.TARGET_COLUMN])
|
||||
else:
|
||||
y = None
|
||||
feature_df = df
|
||||
|
||||
|
||||
self.feature_names = list(feature_df.columns)
|
||||
|
||||
X = feature_df.values
|
||||
|
||||
X = self.scaler.fit_transform(X)
|
||||
|
||||
X = self.scaler.fit_transform(feature_df.values)
|
||||
self.is_fitted = True
|
||||
|
||||
return X, y
|
||||
|
||||
|
||||
def transform(self, df):
|
||||
if not self.is_fitted:
|
||||
raise ValueError("Preprocessor has not been fitted yet.")
|
||||
|
||||
|
||||
df = self.clean_data(df)
|
||||
|
||||
if 'Absenteeism time in hours' in df.columns:
|
||||
feature_df = df.drop(columns=['Absenteeism time in hours'])
|
||||
if config.TARGET_COLUMN in df.columns:
|
||||
feature_df = df.drop(columns=[config.TARGET_COLUMN])
|
||||
else:
|
||||
feature_df = df
|
||||
|
||||
X = feature_df.values
|
||||
X = self.scaler.transform(X)
|
||||
|
||||
return X
|
||||
|
||||
return self.scaler.transform(feature_df.values)
|
||||
|
||||
def save_preprocessor(self):
|
||||
os.makedirs(config.MODELS_DIR, exist_ok=True)
|
||||
joblib.dump(self.scaler, config.SCALER_PATH)
|
||||
joblib.dump(self.feature_names, os.path.join(config.MODELS_DIR, 'feature_names.pkl'))
|
||||
|
||||
|
||||
def load_preprocessor(self):
|
||||
self.scaler = joblib.load(config.SCALER_PATH)
|
||||
feature_names_path = os.path.join(config.MODELS_DIR, 'feature_names.pkl')
|
||||
@@ -84,22 +75,18 @@ class DataPreprocessor:
|
||||
def get_clean_data():
|
||||
preprocessor = DataPreprocessor()
|
||||
df = preprocessor.load_raw_data()
|
||||
df = preprocessor.clean_data(df)
|
||||
return df
|
||||
return preprocessor.clean_data(df)
|
||||
|
||||
|
||||
def save_clean_data():
|
||||
preprocessor = DataPreprocessor()
|
||||
df = preprocessor.load_raw_data()
|
||||
df = preprocessor.clean_data(df)
|
||||
|
||||
os.makedirs(config.PROCESSED_DATA_DIR, exist_ok=True)
|
||||
df.to_csv(config.CLEAN_DATA_PATH, index=False, sep=',')
|
||||
|
||||
return df
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
df = save_clean_data()
|
||||
print(f"Clean data saved. Shape: {df.shape}")
|
||||
print(df.head())
|
||||
data = save_clean_data()
|
||||
print(f"Clean data saved. Shape: {data.shape}")
|
||||
|
||||
@@ -1,123 +1,57 @@
|
||||
import sys
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime
|
||||
|
||||
import joblib
|
||||
import numpy as np
|
||||
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor
|
||||
from sklearn.feature_selection import SelectKBest, f_regression
|
||||
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
|
||||
from sklearn.model_selection import RandomizedSearchCV, train_test_split
|
||||
from sklearn.preprocessing import RobustScaler
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import time
|
||||
from sklearn.ensemble import (
|
||||
RandomForestRegressor,
|
||||
GradientBoostingRegressor,
|
||||
ExtraTreesRegressor,
|
||||
StackingRegressor
|
||||
)
|
||||
from sklearn.linear_model import Ridge
|
||||
from sklearn.model_selection import train_test_split, RandomizedSearchCV
|
||||
from sklearn.preprocessing import RobustScaler, LabelEncoder
|
||||
from sklearn.feature_selection import SelectKBest, f_regression
|
||||
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
|
||||
import xgboost as xgb
|
||||
import lightgbm as lgb
|
||||
import joblib
|
||||
import warnings
|
||||
warnings.filterwarnings('ignore')
|
||||
|
||||
import config
|
||||
from core.model_features import (
|
||||
NUMERICAL_OUTLIER_COLUMNS,
|
||||
ORDINAL_COLUMNS,
|
||||
TARGET_COLUMN,
|
||||
align_feature_frame,
|
||||
apply_label_encoders,
|
||||
apply_outlier_bounds,
|
||||
engineer_features,
|
||||
extract_xy,
|
||||
fit_label_encoders,
|
||||
fit_outlier_bounds,
|
||||
make_target_bins,
|
||||
normalize_columns,
|
||||
prepare_modeling_dataframe,
|
||||
to_float_array,
|
||||
)
|
||||
from core.preprocessing import get_clean_data
|
||||
|
||||
try:
|
||||
import lightgbm as lgb
|
||||
except ImportError:
|
||||
lgb = None
|
||||
|
||||
try:
|
||||
import xgboost as xgb
|
||||
except ImportError:
|
||||
xgb = None
|
||||
|
||||
|
||||
def print_training_log(model_name, start_time, best_score, best_params, n_iter, cv_folds):
|
||||
elapsed = time.time() - start_time
|
||||
print(f" {'─'*50}")
|
||||
print(f" Model: {model_name}")
|
||||
print(f" Time: {elapsed:.1f}s")
|
||||
print(f" Best CV R2: {best_score:.4f}")
|
||||
print(f" Best params:")
|
||||
for k, v in best_params.items():
|
||||
print(f" - {k}: {v}")
|
||||
print(f" Iterations: {n_iter}, CV folds: {cv_folds}")
|
||||
print(f" {'─'*50}")
|
||||
|
||||
|
||||
class DataAugmenter:
|
||||
def __init__(self, noise_level=0.02, n_augment=2):
|
||||
self.noise_level = noise_level
|
||||
self.n_augment = n_augment
|
||||
|
||||
def augment(self, df, target_col='Absenteeism time in hours'):
|
||||
print(f"\nData Augmentation...")
|
||||
print(f" Original size: {len(df)}")
|
||||
|
||||
augmented_dfs = [df]
|
||||
|
||||
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
|
||||
if target_col in numerical_cols:
|
||||
numerical_cols.remove(target_col)
|
||||
|
||||
for i in range(self.n_augment):
|
||||
df_aug = df.copy()
|
||||
|
||||
for col in numerical_cols:
|
||||
if col in df_aug.columns:
|
||||
std_val = df_aug[col].std()
|
||||
if std_val > 0:
|
||||
noise = np.random.normal(0, self.noise_level * std_val, len(df_aug))
|
||||
df_aug[col] = df_aug[col] + noise
|
||||
|
||||
augmented_dfs.append(df_aug)
|
||||
|
||||
df_result = pd.concat(augmented_dfs, ignore_index=True)
|
||||
print(f" Augmented size: {len(df_result)}")
|
||||
|
||||
return df_result
|
||||
|
||||
def smote_regression(self, df, target_col='Absenteeism time in hours'):
|
||||
df = df.copy()
|
||||
y = df[target_col].values
|
||||
|
||||
bins = [0, 1, 4, 8, 100]
|
||||
labels = ['zero', 'low', 'medium', 'high']
|
||||
df['_target_bin'] = pd.cut(y, bins=bins, labels=labels, include_lowest=True)
|
||||
|
||||
bin_counts = df['_target_bin'].value_counts()
|
||||
max_count = bin_counts.max()
|
||||
|
||||
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
|
||||
if target_col in numerical_cols:
|
||||
numerical_cols.remove(target_col)
|
||||
if '_target_bin' in numerical_cols:
|
||||
numerical_cols.remove('_target_bin')
|
||||
|
||||
augmented_rows = []
|
||||
for bin_label in labels:
|
||||
bin_df = df[df['_target_bin'] == bin_label].drop(columns=['_target_bin'])
|
||||
bin_size = len(bin_df)
|
||||
|
||||
if bin_size < max_count and bin_size > 0:
|
||||
n_samples_to_add = max_count - bin_size
|
||||
|
||||
for _ in range(n_samples_to_add):
|
||||
idx = np.random.choice(bin_df.index)
|
||||
sample = bin_df.loc[idx].copy()
|
||||
|
||||
for col in numerical_cols:
|
||||
if col in sample.index:
|
||||
std_val = bin_df[col].std()
|
||||
if std_val > 0:
|
||||
noise = np.random.normal(0, 0.02 * std_val)
|
||||
sample[col] = sample[col] + noise
|
||||
|
||||
augmented_rows.append(sample)
|
||||
|
||||
if augmented_rows:
|
||||
df_aug = pd.DataFrame(augmented_rows)
|
||||
df_result = pd.concat([df.drop(columns=['_target_bin']), df_aug], ignore_index=True)
|
||||
else:
|
||||
df_result = df.drop(columns=['_target_bin'])
|
||||
|
||||
print(f" After SMOTE-like augmentation: {len(df_result)}")
|
||||
|
||||
return df_result
|
||||
print(f' {"-" * 50}')
|
||||
print(f' Model: {model_name}')
|
||||
print(f' Time: {elapsed:.1f}s')
|
||||
print(f' Best CV R2: {best_score:.4f}')
|
||||
for key, value in best_params.items():
|
||||
print(f' - {key}: {value}')
|
||||
print(f' Iterations: {n_iter}, CV folds: {cv_folds}')
|
||||
|
||||
|
||||
class OptimizedModelTrainer:
|
||||
@@ -128,461 +62,237 @@ class OptimizedModelTrainer:
|
||||
self.selected_features = None
|
||||
self.label_encoders = {}
|
||||
self.model_metrics = {}
|
||||
self.augmenter = DataAugmenter(noise_level=0.02, n_augment=2)
|
||||
|
||||
self.training_metadata = {}
|
||||
self.feature_selector = None
|
||||
self.outlier_bounds = {}
|
||||
self.feature_k = 22
|
||||
self.target_transform = 'log1p'
|
||||
self.enabled_models = ['random_forest', 'gradient_boosting', 'extra_trees', 'lightgbm', 'xgboost']
|
||||
|
||||
def analyze_data(self, df):
|
||||
print("\n" + "="*60)
|
||||
print("Data Analysis")
|
||||
print("="*60)
|
||||
|
||||
y = df['Absenteeism time in hours']
|
||||
|
||||
print(f"\nTarget variable statistics:")
|
||||
print(f" Min: {y.min()}")
|
||||
print(f" Max: {y.max()}")
|
||||
print(f" Mean: {y.mean():.2f}")
|
||||
print(f" Median: {y.median():.2f}")
|
||||
print(f" Std: {y.std():.2f}")
|
||||
print(f" Skewness: {y.skew():.2f}")
|
||||
|
||||
print(f"\nTarget distribution:")
|
||||
print(f" Zero values: {(y == 0).sum()} ({(y == 0).sum() / len(y) * 100:.1f}%)")
|
||||
print(f" 1-8 hours: {((y > 0) & (y <= 8)).sum()} ({((y > 0) & (y <= 8)).sum() / len(y) * 100:.1f}%)")
|
||||
print(f" >8 hours: {(y > 8).sum()} ({(y > 8).sum() / len(y) * 100:.1f}%)")
|
||||
|
||||
return y
|
||||
|
||||
def clip_outliers(self, df, columns, lower_pct=1, upper_pct=99):
|
||||
df_clean = df.copy()
|
||||
|
||||
for col in columns:
|
||||
if col in df_clean.columns and df_clean[col].dtype in ['int64', 'float64']:
|
||||
if col == 'Absenteeism time in hours':
|
||||
continue
|
||||
lower = df_clean[col].quantile(lower_pct / 100)
|
||||
upper = df_clean[col].quantile(upper_pct / 100)
|
||||
df_clean[col] = df_clean[col].clip(lower, upper)
|
||||
|
||||
return df_clean
|
||||
|
||||
def feature_engineering(self, df):
|
||||
df = df.copy()
|
||||
|
||||
df['workload_per_age'] = df['Work load Average/day'] / (df['Age'] + 1)
|
||||
df['expense_per_distance'] = df['Transportation expense'] / (df['Distance from Residence to Work'] + 1)
|
||||
df['age_service_ratio'] = df['Age'] / (df['Service time'] + 1)
|
||||
|
||||
df['has_children'] = (df['Son'] > 0).astype(int)
|
||||
df['has_pet'] = (df['Pet'] > 0).astype(int)
|
||||
df['family_responsibility'] = df['Son'] + df['Pet']
|
||||
|
||||
df['health_risk'] = ((df['Social drinker'] == 1) | (df['Social smoker'] == 1) | (df['Body mass index'] > 30)).astype(int)
|
||||
df['lifestyle_risk'] = df['Social drinker'].astype(int) + df['Social smoker'].astype(int)
|
||||
|
||||
df['age_group'] = pd.cut(df['Age'], bins=[0, 30, 40, 50, 100], labels=[1, 2, 3, 4])
|
||||
df['service_group'] = pd.cut(df['Service time'], bins=[0, 5, 10, 20, 100], labels=[1, 2, 3, 4])
|
||||
df['bmi_category'] = pd.cut(df['Body mass index'], bins=[0, 18.5, 25, 30, 100], labels=[1, 2, 3, 4])
|
||||
|
||||
df['workload_category'] = pd.cut(df['Work load Average/day'], bins=[0, 200, 250, 300, 500], labels=[1, 2, 3, 4])
|
||||
df['commute_category'] = pd.cut(df['Distance from Residence to Work'], bins=[0, 10, 20, 50, 100], labels=[1, 2, 3, 4])
|
||||
|
||||
df['seasonal_risk'] = df['Seasons'].apply(lambda x: 1 if x in [1, 3] else 0)
|
||||
df['weekday_risk'] = df['Day of the week'].apply(lambda x: 1 if x in [2, 6] else 0)
|
||||
|
||||
df['hit_target_ratio'] = df['Hit target'] / 100
|
||||
df['experience_level'] = pd.cut(df['Service time'], bins=[0, 5, 10, 15, 100], labels=[1, 2, 3, 4])
|
||||
|
||||
df['age_workload_interaction'] = df['Age'] * df['Work load Average/day'] / 10000
|
||||
df['service_bmi_interaction'] = df['Service time'] * df['Body mass index'] / 100
|
||||
|
||||
return df
|
||||
|
||||
y = df[TARGET_COLUMN]
|
||||
print('\nData Analysis')
|
||||
print(f' Samples: {len(df)}')
|
||||
print(f' Mean: {y.mean():.2f}, Median: {y.median():.2f}, Std: {y.std():.2f}')
|
||||
print(f' High risk ratio (>8h): {(y > 8).mean() * 100:.1f}%')
|
||||
|
||||
def select_features(self, X, y, k=20):
    """Fit a univariate F-test selector and keep the ``k`` best columns of ``X``.

    The selector is fitted on ``X``/``y`` with ``f_regression`` scoring, the
    ranked scores are printed, and both the fitted selector and the names of
    the surviving columns are stored on the instance.

    Args:
        X: Feature matrix whose columns line up with ``self.feature_names``.
        y: Target values used to score each column.
        k: Maximum number of columns to keep; capped at ``X.shape[1]``.

    Returns:
        ``selector.transform(X)``, i.e. ``X`` reduced to the selected columns.
    """
    print("\nFeature Selection...")

    selector = SelectKBest(score_func=f_regression, k=min(k, X.shape[1]))
    selector.fit(X, y)

    # Rank every candidate by F-score (highest first) for the console report.
    ranked = sorted(
        zip(self.feature_names, selector.scores_),
        key=lambda pair: pair[1],
        reverse=True,
    )
    shown = min(k, len(ranked))

    print(f"\nTop {shown} features by F-score:")
    for position, (name, score) in enumerate(ranked[:shown], start=1):
        label = config.FEATURE_NAME_CN.get(name, name)
        print(f" {position}. {label}: {score:.2f}")

    # Store the fitted selector and derive the surviving feature names from
    # its support mask exactly once (the mask was previously built twice).
    self.feature_selector = selector
    support = selector.get_support()
    self.selected_features = [
        name for name, keep in zip(self.feature_names, support) if keep
    ]

    return selector.transform(X)
|
||||
|
||||
|
||||
def transform_target(self, y):
    """Map raw target values into the space the models are fitted on.

    When ``self.target_transform`` equals ``'log1p'`` the values are clipped
    at zero from below and passed through ``np.log1p``; for any other
    setting ``y`` is returned unchanged.
    """
    if self.target_transform != 'log1p':
        return y
    non_negative = np.clip(y, a_min=0, a_max=None)
    return np.log1p(non_negative)
|
||||
|
||||
def inverse_transform_target(self, y_pred):
    """Map model output back to the original target scale.

    Applies ``np.expm1`` when ``self.target_transform`` equals ``'log1p'``;
    otherwise ``y_pred`` is returned unchanged.
    """
    if self.target_transform == 'log1p':
        return np.expm1(y_pred)
    return y_pred
|
||||
|
||||
def transform_features(self, X_df):
    """Run a feature frame through the fitted scaler and optional selector.

    ``X_df`` is first passed to ``align_feature_frame`` together with
    ``self.feature_names`` (helper defined elsewhere; presumably it brings
    the columns into the training layout, to be confirmed), converted with
    ``to_float_array`` and scaled by ``self.scaler``. If
    ``self.feature_selector`` is set, its ``transform`` is applied to the
    scaled matrix; otherwise the scaled matrix is returned as is.
    """
    aligned = align_feature_frame(X_df, self.feature_names)
    scaled = self.scaler.transform(to_float_array(aligned))
    if self.feature_selector:
        return self.feature_selector.transform(scaled)
    return scaled
|
||||
|
||||
def prepare_data(self):
|
||||
df = get_clean_data()
|
||||
df.columns = [col.strip() for col in df.columns]
|
||||
|
||||
df = df.drop(columns=['ID'])
|
||||
|
||||
cols_to_drop = ['Weight', 'Height', 'Reason for absence']
|
||||
for col in cols_to_drop:
|
||||
if col in df.columns:
|
||||
df = df.drop(columns=[col])
|
||||
print(" Removed features: Weight, Height, Reason for absence (data leakage risk)")
|
||||
|
||||
df = normalize_columns(get_clean_data())
|
||||
df = prepare_modeling_dataframe(df)
|
||||
self.analyze_data(df)
|
||||
|
||||
print("\n" + "="*60)
|
||||
print("Data Preprocessing")
|
||||
print("="*60)
|
||||
|
||||
numerical_cols = ['Age', 'Service time', 'Work load Average/day',
|
||||
'Transportation expense', 'Distance from Residence to Work',
|
||||
'Hit target', 'Body mass index']
|
||||
df = self.clip_outliers(df, numerical_cols)
|
||||
print(" Outliers clipped (1st-99th percentile)")
|
||||
|
||||
print("\n" + "="*60)
|
||||
print("Data Augmentation")
|
||||
print("="*60)
|
||||
|
||||
df = self.augmenter.smote_regression(df)
|
||||
df = self.augmenter.augment(df)
|
||||
|
||||
print("\n" + "="*60)
|
||||
print("Feature Engineering")
|
||||
print("="*60)
|
||||
|
||||
df = self.feature_engineering(df)
|
||||
|
||||
y = df['Absenteeism time in hours'].values
|
||||
X_df = df.drop(columns=['Absenteeism time in hours'])
|
||||
|
||||
ordinal_cols = ['Month of absence', 'Day of the week', 'Seasons',
|
||||
'Disciplinary failure', 'Education', 'Social drinker',
|
||||
'Social smoker', 'age_group', 'service_group',
|
||||
'bmi_category', 'workload_category', 'commute_category',
|
||||
'experience_level']
|
||||
|
||||
for col in ordinal_cols:
|
||||
if col in X_df.columns:
|
||||
le = LabelEncoder()
|
||||
X_df[col] = le.fit_transform(X_df[col].astype(str))
|
||||
self.label_encoders[col] = le
|
||||
|
||||
self.feature_names = list(X_df.columns)
|
||||
|
||||
X = X_df.values.astype(float)
|
||||
|
||||
X = self.scaler.fit_transform(X)
|
||||
|
||||
X = self.select_features(X, y, k=20)
|
||||
|
||||
print(f"\nFinal feature count: {X.shape[1]}")
|
||||
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
X, y, test_size=0.2, random_state=42
|
||||
|
||||
target_bins = make_target_bins(df[TARGET_COLUMN].values)
|
||||
train_df, test_df = train_test_split(
|
||||
df,
|
||||
test_size=config.TEST_SIZE,
|
||||
random_state=config.RANDOM_STATE,
|
||||
stratify=target_bins,
|
||||
)
|
||||
|
||||
train_df = train_df.reset_index(drop=True)
|
||||
test_df = test_df.reset_index(drop=True)
|
||||
|
||||
self.outlier_bounds = fit_outlier_bounds(train_df, NUMERICAL_OUTLIER_COLUMNS)
|
||||
train_df = apply_outlier_bounds(train_df, self.outlier_bounds)
|
||||
test_df = apply_outlier_bounds(test_df, self.outlier_bounds)
|
||||
|
||||
train_df = engineer_features(train_df)
|
||||
test_df = engineer_features(test_df)
|
||||
X_train_df, y_train = extract_xy(train_df)
|
||||
X_test_df, y_test = extract_xy(test_df)
|
||||
|
||||
X_train_df, self.label_encoders = fit_label_encoders(X_train_df, ORDINAL_COLUMNS)
|
||||
X_test_df = apply_label_encoders(X_test_df, self.label_encoders)
|
||||
|
||||
self.feature_names = list(X_train_df.columns)
|
||||
X_test_df = align_feature_frame(X_test_df, self.feature_names)
|
||||
X_train = self.scaler.fit_transform(to_float_array(X_train_df))
|
||||
X_test = self.scaler.transform(to_float_array(X_test_df))
|
||||
|
||||
transformed_target = self.transform_target(y_train)
|
||||
X_train = self.select_features(X_train, transformed_target, k=self.feature_k)
|
||||
X_test = self.transform_features(X_test_df)
|
||||
|
||||
self.training_metadata = {
|
||||
'train_samples': int(len(train_df)),
|
||||
'test_samples': int(len(test_df)),
|
||||
'feature_count_before_selection': int(len(self.feature_names)),
|
||||
'feature_count_after_selection': int(X_train.shape[1]),
|
||||
'training_date': datetime.now().strftime('%Y-%m-%d'),
|
||||
'target_transform': self.target_transform,
|
||||
'available_models': list(self.enabled_models),
|
||||
}
|
||||
return X_train, X_test, y_train, y_test
|
||||
|
||||
|
||||
def _run_search(self, name, estimator, params, X_train, y_train, n_iter=12, cv=4):
    """Tune ``estimator`` with a randomized search and register the winner.

    Args:
        name: Key under which the best estimator is stored in ``self.models``.
        estimator: Unfitted regressor to tune.
        params: Parameter distributions handed to ``RandomizedSearchCV``.
        X_train: Training feature matrix.
        y_train: Training target values.
        n_iter: Number of parameter settings to sample.
        cv: Number of cross-validation folds. Defaults to 4, the value that
            was previously hard-coded in both the search and the log call.

    Returns:
        None. The best estimator is stored in ``self.models[name]`` and a
        summary is written through ``print_training_log``.
    """
    start_time = time.time()
    search = RandomizedSearchCV(
        estimator,
        param_distributions=params,
        n_iter=n_iter,
        cv=cv,
        scoring='r2',
        n_jobs=-1,
        random_state=config.RANDOM_STATE,
    )
    search.fit(X_train, y_train)
    self.models[name] = search.best_estimator_
    # Pass the same fold count that was used above so the log cannot drift
    # from the actual search configuration.
    print_training_log(name, start_time, search.best_score_, search.best_params_, n_iter, cv)
|
||||
|
||||
def train_random_forest(self, X_train, y_train):
|
||||
print("\n" + "="*60)
|
||||
print("Training Random Forest")
|
||||
print("="*60)
|
||||
|
||||
start_time = time.time()
|
||||
rf = RandomForestRegressor(random_state=42, n_jobs=-1)
|
||||
|
||||
param_distributions = {
|
||||
'n_estimators': [200, 300, 400],
|
||||
'max_depth': [10, 15, 20, 25],
|
||||
'min_samples_split': [2, 5, 10],
|
||||
'min_samples_leaf': [1, 2, 4],
|
||||
'max_features': ['sqrt', 0.7]
|
||||
}
|
||||
|
||||
print(f" Searching {20*5} parameter combinations...")
|
||||
random_search = RandomizedSearchCV(
|
||||
rf, param_distributions, n_iter=20, cv=5,
|
||||
scoring='r2', n_jobs=-1, random_state=42
|
||||
self._run_search(
|
||||
'random_forest',
|
||||
RandomForestRegressor(random_state=config.RANDOM_STATE, n_jobs=-1),
|
||||
{
|
||||
'n_estimators': [200, 300, 400],
|
||||
'max_depth': [10, 14, 18, None],
|
||||
'min_samples_split': [2, 4, 8],
|
||||
'min_samples_leaf': [1, 2, 3],
|
||||
'max_features': ['sqrt', 0.7],
|
||||
},
|
||||
X_train,
|
||||
y_train,
|
||||
)
|
||||
random_search.fit(X_train, y_train)
|
||||
|
||||
self.models['random_forest'] = random_search.best_estimator_
|
||||
print_training_log("Random Forest", start_time, random_search.best_score_,
|
||||
random_search.best_params_, 20, 5)
|
||||
|
||||
return random_search.best_estimator_
|
||||
|
||||
def train_xgboost(self, X_train, y_train):
|
||||
print("\n" + "="*60)
|
||||
print("Training XGBoost")
|
||||
print("="*60)
|
||||
|
||||
start_time = time.time()
|
||||
xgb_model = xgb.XGBRegressor(random_state=42, n_jobs=-1)
|
||||
|
||||
param_distributions = {
|
||||
'n_estimators': [200, 300, 400],
|
||||
'max_depth': [5, 7, 9],
|
||||
'learning_rate': [0.05, 0.1],
|
||||
'subsample': [0.7, 0.8],
|
||||
'colsample_bytree': [0.7, 0.8],
|
||||
'min_child_weight': [1, 3],
|
||||
'reg_alpha': [0, 0.1],
|
||||
'reg_lambda': [1, 1.5]
|
||||
}
|
||||
|
||||
print(f" Searching {20*5} parameter combinations...")
|
||||
random_search = RandomizedSearchCV(
|
||||
xgb_model, param_distributions, n_iter=20, cv=5,
|
||||
scoring='r2', n_jobs=-1, random_state=42
|
||||
)
|
||||
random_search.fit(X_train, y_train)
|
||||
|
||||
self.models['xgboost'] = random_search.best_estimator_
|
||||
print_training_log("XGBoost", start_time, random_search.best_score_,
|
||||
random_search.best_params_, 20, 5)
|
||||
|
||||
return random_search.best_estimator_
|
||||
|
||||
def train_lightgbm(self, X_train, y_train):
|
||||
print("\n" + "="*60)
|
||||
print("Training LightGBM")
|
||||
print("="*60)
|
||||
|
||||
start_time = time.time()
|
||||
lgb_model = lgb.LGBMRegressor(random_state=42, n_jobs=-1, verbose=-1)
|
||||
|
||||
param_distributions = {
|
||||
'n_estimators': [200, 300, 400],
|
||||
'max_depth': [7, 9, 11, -1],
|
||||
'learning_rate': [0.05, 0.1],
|
||||
'subsample': [0.7, 0.8],
|
||||
'colsample_bytree': [0.7, 0.8],
|
||||
'min_child_samples': [5, 10, 20],
|
||||
'reg_alpha': [0, 0.1],
|
||||
'reg_lambda': [1, 1.5],
|
||||
'num_leaves': [31, 50, 70]
|
||||
}
|
||||
|
||||
print(f" Searching {20*5} parameter combinations...")
|
||||
random_search = RandomizedSearchCV(
|
||||
lgb_model, param_distributions, n_iter=20, cv=5,
|
||||
scoring='r2', n_jobs=-1, random_state=42
|
||||
)
|
||||
random_search.fit(X_train, y_train)
|
||||
|
||||
self.models['lightgbm'] = random_search.best_estimator_
|
||||
print_training_log("LightGBM", start_time, random_search.best_score_,
|
||||
random_search.best_params_, 20, 5)
|
||||
|
||||
return random_search.best_estimator_
|
||||
|
||||
|
||||
def train_gradient_boosting(self, X_train, y_train):
|
||||
print("\n" + "="*60)
|
||||
print("Training Gradient Boosting")
|
||||
print("="*60)
|
||||
|
||||
start_time = time.time()
|
||||
gb = GradientBoostingRegressor(random_state=42)
|
||||
|
||||
param_distributions = {
|
||||
'n_estimators': [200, 300],
|
||||
'max_depth': [5, 7, 9],
|
||||
'learning_rate': [0.05, 0.1],
|
||||
'subsample': [0.7, 0.8],
|
||||
'min_samples_split': [2, 5],
|
||||
'min_samples_leaf': [1, 2]
|
||||
}
|
||||
|
||||
print(f" Searching {15*5} parameter combinations...")
|
||||
random_search = RandomizedSearchCV(
|
||||
gb, param_distributions, n_iter=15, cv=5,
|
||||
scoring='r2', n_jobs=-1, random_state=42
|
||||
self._run_search(
|
||||
'gradient_boosting',
|
||||
GradientBoostingRegressor(random_state=config.RANDOM_STATE),
|
||||
{
|
||||
'n_estimators': [160, 220, 300],
|
||||
'max_depth': [3, 4, 5],
|
||||
'learning_rate': [0.03, 0.05, 0.08],
|
||||
'subsample': [0.7, 0.85, 1.0],
|
||||
'min_samples_split': [2, 4, 6],
|
||||
'min_samples_leaf': [1, 2, 3],
|
||||
},
|
||||
X_train,
|
||||
y_train,
|
||||
)
|
||||
random_search.fit(X_train, y_train)
|
||||
|
||||
self.models['gradient_boosting'] = random_search.best_estimator_
|
||||
print_training_log("Gradient Boosting", start_time, random_search.best_score_,
|
||||
random_search.best_params_, 15, 5)
|
||||
|
||||
return random_search.best_estimator_
|
||||
|
||||
|
||||
def train_extra_trees(self, X_train, y_train):
|
||||
print("\n" + "="*60)
|
||||
print("Training Extra Trees")
|
||||
print("="*60)
|
||||
|
||||
start_time = time.time()
|
||||
et = ExtraTreesRegressor(random_state=42, n_jobs=-1)
|
||||
|
||||
param_distributions = {
|
||||
'n_estimators': [200, 300, 400],
|
||||
'max_depth': [10, 15, 20],
|
||||
'min_samples_split': [2, 5, 10],
|
||||
'min_samples_leaf': [1, 2, 4],
|
||||
'max_features': ['sqrt', 0.7]
|
||||
}
|
||||
|
||||
print(f" Searching {20*5} parameter combinations...")
|
||||
random_search = RandomizedSearchCV(
|
||||
et, param_distributions, n_iter=20, cv=5,
|
||||
scoring='r2', n_jobs=-1, random_state=42
|
||||
self._run_search(
|
||||
'extra_trees',
|
||||
ExtraTreesRegressor(random_state=config.RANDOM_STATE, n_jobs=-1),
|
||||
{
|
||||
'n_estimators': [220, 320, 420],
|
||||
'max_depth': [10, 15, 20, None],
|
||||
'min_samples_split': [2, 4, 8],
|
||||
'min_samples_leaf': [1, 2, 3],
|
||||
'max_features': ['sqrt', 0.7],
|
||||
},
|
||||
X_train,
|
||||
y_train,
|
||||
)
|
||||
random_search.fit(X_train, y_train)
|
||||
|
||||
self.models['extra_trees'] = random_search.best_estimator_
|
||||
print_training_log("Extra Trees", start_time, random_search.best_score_,
|
||||
random_search.best_params_, 20, 5)
|
||||
|
||||
return random_search.best_estimator_
|
||||
|
||||
def train_stacking(self, X_train, y_train):
|
||||
print("\n" + "="*60)
|
||||
print("Training Stacking Ensemble")
|
||||
print("="*60)
|
||||
|
||||
start_time = time.time()
|
||||
base_estimators = []
|
||||
|
||||
if 'random_forest' in self.models:
|
||||
base_estimators.append(('rf', self.models['random_forest']))
|
||||
if 'xgboost' in self.models:
|
||||
base_estimators.append(('xgb', self.models['xgboost']))
|
||||
if 'lightgbm' in self.models:
|
||||
base_estimators.append(('lgb', self.models['lightgbm']))
|
||||
if 'gradient_boosting' in self.models:
|
||||
base_estimators.append(('gb', self.models['gradient_boosting']))
|
||||
|
||||
if len(base_estimators) < 2:
|
||||
print(" Not enough base models for stacking")
|
||||
return None
|
||||
|
||||
print(f" Base estimators: {[name for name, _ in base_estimators]}")
|
||||
print(f" Meta learner: Ridge")
|
||||
print(f" CV folds: 5")
|
||||
|
||||
stacking = StackingRegressor(
|
||||
estimators=base_estimators,
|
||||
final_estimator=Ridge(alpha=1.0),
|
||||
cv=5,
|
||||
n_jobs=-1
|
||||
|
||||
def train_lightgbm(self, X_train, y_train):
    """Tune a LightGBM regressor and register it as ``'lightgbm'``.

    Returns immediately when ``lgb`` is ``None`` (presumably the optional
    import failed; that is set outside this block). Otherwise the search is
    delegated to ``self._run_search``.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.
    """
    if lgb is None:
        return

    # LightGBM applies ``subsample`` only when ``subsample_freq`` is positive.
    # With the default of 0 the ``subsample`` values searched below would be
    # ignored, so row subsampling is enabled on every boosting iteration.
    estimator = lgb.LGBMRegressor(
        random_state=config.RANDOM_STATE,
        n_jobs=-1,
        verbose=-1,
        subsample_freq=1,
    )
    search_space = {
        'n_estimators': [180, 260, 340],
        'max_depth': [7, 9, -1],
        'learning_rate': [0.03, 0.05, 0.08],
        'subsample': [0.7, 0.85, 1.0],
        'colsample_bytree': [0.7, 0.85, 1.0],
        'num_leaves': [31, 50, 70],
    }
    self._run_search('lightgbm', estimator, search_space, X_train, y_train)
|
||||
stacking.fit(X_train, y_train)
|
||||
|
||||
self.models['stacking'] = stacking
|
||||
elapsed = time.time() - start_time
|
||||
print(f" {'─'*50}")
|
||||
print(f" Stacking ensemble created in {elapsed:.1f}s")
|
||||
print(f" {'─'*50}")
|
||||
|
||||
return stacking
|
||||
|
||||
|
||||
def train_xgboost(self, X_train, y_train):
    """Tune an XGBoost regressor and register it as ``'xgboost'``.

    Returns immediately when ``xgb`` is ``None`` (presumably the optional
    import failed; that is set outside this block). Otherwise the search is
    delegated to ``self._run_search``.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.
    """
    if xgb is None:
        return

    estimator = xgb.XGBRegressor(random_state=config.RANDOM_STATE, n_jobs=-1)
    search_space = {
        'n_estimators': [180, 260, 340],
        'max_depth': [4, 6, 8],
        'learning_rate': [0.03, 0.05, 0.08],
        'subsample': [0.7, 0.85, 1.0],
        'colsample_bytree': [0.7, 0.85, 1.0],
        'min_child_weight': [1, 3, 5],
    }
    self._run_search('xgboost', estimator, search_space, X_train, y_train)
|
||||
|
||||
def evaluate_model(self, model, X_test, y_test):
|
||||
y_pred = model.predict(X_test)
|
||||
|
||||
r2 = r2_score(y_test, y_pred)
|
||||
y_pred = self.inverse_transform_target(model.predict(X_test))
|
||||
y_pred = np.clip(y_pred, a_min=0, a_max=None)
|
||||
mse = mean_squared_error(y_test, y_pred)
|
||||
rmse = np.sqrt(mse)
|
||||
mae = mean_absolute_error(y_test, y_pred)
|
||||
|
||||
return {
|
||||
'r2': round(r2, 4),
|
||||
'r2': round(r2_score(y_test, y_pred), 4),
|
||||
'mse': round(mse, 4),
|
||||
'rmse': round(rmse, 4),
|
||||
'mae': round(mae, 4)
|
||||
'rmse': round(np.sqrt(mse), 4),
|
||||
'mae': round(mean_absolute_error(y_test, y_pred), 4),
|
||||
}
|
||||
|
||||
|
||||
def save_models(self):
|
||||
os.makedirs(config.MODELS_DIR, exist_ok=True)
|
||||
|
||||
for name, model in self.models.items():
|
||||
if model is not None:
|
||||
model_path = os.path.join(config.MODELS_DIR, f'{name}_model.pkl')
|
||||
joblib.dump(model, model_path)
|
||||
print(f" {name} saved")
|
||||
|
||||
joblib.dump(model, os.path.join(config.MODELS_DIR, f'{name}_model.pkl'))
|
||||
joblib.dump(self.scaler, config.SCALER_PATH)
|
||||
joblib.dump(self.feature_names, os.path.join(config.MODELS_DIR, 'feature_names.pkl'))
|
||||
joblib.dump(self.selected_features, os.path.join(config.MODELS_DIR, 'selected_features.pkl'))
|
||||
joblib.dump(self.label_encoders, os.path.join(config.MODELS_DIR, 'label_encoders.pkl'))
|
||||
joblib.dump(self.model_metrics, os.path.join(config.MODELS_DIR, 'model_metrics.pkl'))
|
||||
print(" Scaler and feature info saved")
|
||||
|
||||
joblib.dump(self.training_metadata, os.path.join(config.MODELS_DIR, 'training_metadata.pkl'))
|
||||
|
||||
def train_all(self):
|
||||
total_start = time.time()
|
||||
print("\n" + "="*60)
|
||||
print("Optimized Model Training Started")
|
||||
print("="*60)
|
||||
print(f"Start time: {time.strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
|
||||
print('\nOptimized Model Training Started')
|
||||
X_train, X_test, y_train, y_test = self.prepare_data()
|
||||
|
||||
print(f"\nTrain size: {len(X_train)}, Test size: {len(X_test)}")
|
||||
|
||||
print("\n" + "="*60)
|
||||
print("Training Models with Hyperparameter Optimization")
|
||||
print("="*60)
|
||||
|
||||
self.train_random_forest(X_train, y_train)
|
||||
self.train_extra_trees(X_train, y_train)
|
||||
self.train_xgboost(X_train, y_train)
|
||||
self.train_lightgbm(X_train, y_train)
|
||||
self.train_gradient_boosting(X_train, y_train)
|
||||
self.train_stacking(X_train, y_train)
|
||||
|
||||
print("\n" + "="*60)
|
||||
print("Evaluating Models on Test Set")
|
||||
print("="*60)
|
||||
|
||||
best_r2 = -float('inf')
|
||||
best_model = None
|
||||
|
||||
y_train_transformed = self.transform_target(y_train)
|
||||
|
||||
if 'random_forest' in self.enabled_models:
|
||||
self.train_random_forest(X_train, y_train_transformed)
|
||||
if 'gradient_boosting' in self.enabled_models:
|
||||
self.train_gradient_boosting(X_train, y_train_transformed)
|
||||
if 'extra_trees' in self.enabled_models:
|
||||
self.train_extra_trees(X_train, y_train_transformed)
|
||||
if 'lightgbm' in self.enabled_models:
|
||||
self.train_lightgbm(X_train, y_train_transformed)
|
||||
if 'xgboost' in self.enabled_models:
|
||||
self.train_xgboost(X_train, y_train_transformed)
|
||||
|
||||
for name, model in self.models.items():
|
||||
if model is not None:
|
||||
metrics = self.evaluate_model(model, X_test, y_test)
|
||||
self.model_metrics[name] = metrics
|
||||
|
||||
status = "Good" if metrics['r2'] > 0.5 else ("OK" if metrics['r2'] > 0.3 else "Poor")
|
||||
status_icon = "✓" if status == "Good" else ("△" if status == "OK" else "✗")
|
||||
print(f" {status_icon} {name:20s} - R2: {metrics['r2']:.4f}, RMSE: {metrics['rmse']:.4f}, MAE: {metrics['mae']:.4f}")
|
||||
|
||||
if metrics['r2'] > best_r2:
|
||||
best_r2 = metrics['r2']
|
||||
best_model = name
|
||||
|
||||
print(f"\n ★ Best Model: {best_model} (R2 = {best_r2:.4f})")
|
||||
|
||||
print("\n" + "="*60)
|
||||
print("Saving Models")
|
||||
print("="*60)
|
||||
metrics = self.evaluate_model(model, X_test, y_test)
|
||||
self.model_metrics[name] = metrics
|
||||
print(f' {name:20s} R2={metrics["r2"]:.4f} RMSE={metrics["rmse"]:.4f} MAE={metrics["mae"]:.4f}')
|
||||
|
||||
self.save_models()
|
||||
|
||||
return self.model_metrics
|
||||
|
||||
|
||||
def train_and_save_models():
|
||||
total_start = time.time()
|
||||
start = time.time()
|
||||
trainer = OptimizedModelTrainer()
|
||||
metrics = trainer.train_all()
|
||||
total_elapsed = time.time() - total_start
|
||||
|
||||
print("\n" + "="*60)
|
||||
print("Training Complete!")
|
||||
print("="*60)
|
||||
print(f"Total training time: {total_elapsed:.1f}s ({total_elapsed/60:.1f} min)")
|
||||
print(f"End time: {time.strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
|
||||
print("\n" + "-"*60)
|
||||
print("Final Model Ranking (by R2)")
|
||||
print("-"*60)
|
||||
|
||||
sorted_metrics = sorted(metrics.items(), key=lambda x: x[1]['r2'], reverse=True)
|
||||
for i, (name, m) in enumerate(sorted_metrics, 1):
|
||||
medal = "🥇" if i == 1 else ("🥈" if i == 2 else ("🥉" if i == 3 else " "))
|
||||
print(f" {medal} {i}. {name:20s} - R2: {m['r2']:.4f}, RMSE: {m['rmse']:.4f}")
|
||||
|
||||
print(f'\nTraining Complete in {time.time() - start:.1f}s')
|
||||
for idx, (name, metric) in enumerate(sorted(metrics.items(), key=lambda item: item[1]['r2'], reverse=True), start=1):
|
||||
print(f'{idx}. {name} - R2={metric["r2"]:.4f}')
|
||||
return metrics
|
||||
|
||||
|
||||
|
||||
12001
backend/data/raw/china_enterprise_absence_events.csv
Normal file
12001
backend/data/raw/china_enterprise_absence_events.csv
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,6 +1,6 @@
|
||||
import os
|
||||
|
||||
import joblib
|
||||
import numpy as np
|
||||
|
||||
import config
|
||||
from core.feature_mining import get_correlation_for_heatmap, group_comparison
|
||||
@@ -10,109 +10,95 @@ class AnalysisService:
|
||||
def __init__(self):
|
||||
self.models = {}
|
||||
self.feature_names = None
|
||||
|
||||
self.selected_features = None
|
||||
self.training_metadata = {}
|
||||
|
||||
def _ensure_models_loaded(self):
|
||||
if not self.models:
|
||||
model_files = {
|
||||
'random_forest': 'random_forest_model.pkl',
|
||||
'xgboost': 'xgboost_model.pkl',
|
||||
'lightgbm': 'lightgbm_model.pkl',
|
||||
}
|
||||
|
||||
for name, filename in model_files.items():
|
||||
model_path = os.path.join(config.MODELS_DIR, filename)
|
||||
if os.path.exists(model_path):
|
||||
try:
|
||||
self.models[name] = joblib.load(model_path)
|
||||
except Exception as e:
|
||||
print(f"Failed to load {name}: {e}")
|
||||
|
||||
feature_names_path = os.path.join(config.MODELS_DIR, 'feature_names.pkl')
|
||||
if os.path.exists(feature_names_path):
|
||||
self.feature_names = joblib.load(feature_names_path)
|
||||
|
||||
if self.models:
|
||||
return
|
||||
metadata_path = os.path.join(config.MODELS_DIR, 'training_metadata.pkl')
|
||||
if os.path.exists(metadata_path):
|
||||
self.training_metadata = joblib.load(metadata_path)
|
||||
model_files = {
|
||||
'random_forest': 'random_forest_model.pkl',
|
||||
'xgboost': 'xgboost_model.pkl',
|
||||
'lightgbm': 'lightgbm_model.pkl',
|
||||
'gradient_boosting': 'gradient_boosting_model.pkl',
|
||||
}
|
||||
allowed_models = self.training_metadata.get('available_models')
|
||||
if allowed_models:
|
||||
model_files = {k: v for k, v in model_files.items() if k in allowed_models}
|
||||
for name, filename in model_files.items():
|
||||
path = os.path.join(config.MODELS_DIR, filename)
|
||||
if os.path.exists(path):
|
||||
try:
|
||||
self.models[name] = joblib.load(path)
|
||||
except Exception as exc:
|
||||
print(f'Failed to load model {name}: {exc}')
|
||||
for filename, attr in [('feature_names.pkl', 'feature_names'), ('selected_features.pkl', 'selected_features')]:
|
||||
path = os.path.join(config.MODELS_DIR, filename)
|
||||
if os.path.exists(path):
|
||||
try:
|
||||
setattr(self, attr, joblib.load(path))
|
||||
except Exception as exc:
|
||||
print(f'Failed to load artifact {filename}: {exc}')
|
||||
|
||||
def get_feature_importance(self, model_type='random_forest'):
|
||||
self._ensure_models_loaded()
|
||||
|
||||
if model_type not in self.models:
|
||||
if self.models:
|
||||
model_type = list(self.models.keys())[0]
|
||||
else:
|
||||
return self._get_default_importance()
|
||||
|
||||
model_type = next(iter(self.models), 'default')
|
||||
if model_type == 'default':
|
||||
return self._get_default_importance()
|
||||
model = self.models[model_type]
|
||||
|
||||
try:
|
||||
if hasattr(model, 'feature_importances_'):
|
||||
importances = model.feature_importances_
|
||||
else:
|
||||
return self._get_default_importance()
|
||||
|
||||
feature_names = self.feature_names or [f'feature_{i}' for i in range(len(importances))]
|
||||
|
||||
if len(feature_names) != len(importances):
|
||||
feature_names = [f'feature_{i}' for i in range(len(importances))]
|
||||
|
||||
feature_importance = list(zip(feature_names, importances))
|
||||
feature_importance.sort(key=lambda x: x[1], reverse=True)
|
||||
|
||||
features = []
|
||||
for i, (name, imp) in enumerate(feature_importance[:15]):
|
||||
features.append({
|
||||
if not hasattr(model, 'feature_importances_'):
|
||||
return self._get_default_importance()
|
||||
|
||||
importances = model.feature_importances_
|
||||
feature_names = self.selected_features or self.feature_names or []
|
||||
if len(feature_names) != len(importances):
|
||||
feature_names = [f'feature_{idx}' for idx in range(len(importances))]
|
||||
ranked = sorted(zip(feature_names, importances), key=lambda item: item[1], reverse=True)[:15]
|
||||
return {
|
||||
'model_type': model_type,
|
||||
'features': [
|
||||
{
|
||||
'name': name,
|
||||
'name_cn': config.FEATURE_NAME_CN.get(name, name),
|
||||
'importance': round(float(imp), 4),
|
||||
'rank': i + 1
|
||||
})
|
||||
|
||||
return {
|
||||
'model_type': model_type,
|
||||
'features': features
|
||||
}
|
||||
except Exception as e:
|
||||
print(f"Error getting feature importance: {e}")
|
||||
return self._get_default_importance()
|
||||
|
||||
'importance': round(float(importance), 4),
|
||||
'rank': idx + 1,
|
||||
}
|
||||
for idx, (name, importance) in enumerate(ranked)
|
||||
],
|
||||
}
|
||||
|
||||
def _get_default_importance(self):
|
||||
default_features = [
|
||||
('Reason for absence', 0.25),
|
||||
('Transportation expense', 0.12),
|
||||
('Distance from Residence to Work', 0.10),
|
||||
('Service time', 0.08),
|
||||
('Age', 0.07),
|
||||
('Work load Average/day', 0.06),
|
||||
('Body mass index', 0.05),
|
||||
('Social drinker', 0.04),
|
||||
('Hit target', 0.03),
|
||||
('Son', 0.03),
|
||||
('Pet', 0.02),
|
||||
('Education', 0.02),
|
||||
('Social smoker', 0.01)
|
||||
defaults = [
|
||||
('加班通勤压力指数', 0.24),
|
||||
('健康风险指数', 0.18),
|
||||
('请假类型', 0.12),
|
||||
('通勤时长分钟', 0.1),
|
||||
('月均加班时长', 0.08),
|
||||
('近90天缺勤次数', 0.07),
|
||||
('心理压力等级', 0.06),
|
||||
('家庭负担指数', 0.05),
|
||||
]
|
||||
|
||||
features = []
|
||||
for i, (name, imp) in enumerate(default_features):
|
||||
features.append({
|
||||
'name': name,
|
||||
'name_cn': config.FEATURE_NAME_CN.get(name, name),
|
||||
'importance': imp,
|
||||
'rank': i + 1
|
||||
})
|
||||
|
||||
return {
|
||||
'model_type': 'default',
|
||||
'features': features
|
||||
'features': [
|
||||
{
|
||||
'name': name,
|
||||
'name_cn': config.FEATURE_NAME_CN.get(name, name),
|
||||
'importance': importance,
|
||||
'rank': idx + 1,
|
||||
}
|
||||
for idx, (name, importance) in enumerate(defaults)
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def get_correlation(self):
    """Return whatever ``get_correlation_for_heatmap()`` produces, unchanged."""
    heatmap_payload = get_correlation_for_heatmap()
    return heatmap_payload
|
||||
|
||||
|
||||
def get_group_comparison(self, dimension):
    """Validate ``dimension`` and delegate to ``group_comparison``.

    Args:
        dimension: Grouping key requested by the caller.

    Returns:
        The result of ``group_comparison(dimension)``.

    Raises:
        ValueError: If ``dimension`` is not one of the accepted keys.
    """
    # NOTE(review): the /api/analysis/compare route now defaults to
    # 'industry', which is not in this list. Confirm whether this whitelist
    # is still meant to be enforced.
    valid_dimensions = ['drinker', 'smoker', 'education', 'children', 'pet']

    if dimension in valid_dimensions:
        return group_comparison(dimension)

    raise ValueError(f"Invalid dimension: {dimension}. Must be one of {valid_dimensions}")
|
||||
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ class ClusterService:
|
||||
def get_cluster_profile(self, n_clusters=3):
    """Delegate to ``self.analyzer.get_cluster_profile`` with ``n_clusters``."""
    profile = self.analyzer.get_cluster_profile(n_clusters)
    return profile
|
||||
|
||||
def get_scatter_data(self, n_clusters=3, x_axis='Age', y_axis='Absenteeism time in hours'):
|
||||
def get_scatter_data(self, n_clusters=3, x_axis='月均加班时长', y_axis='缺勤时长(小时)'):
|
||||
return self.analyzer.get_scatter_data(n_clusters, x_axis, y_axis)
|
||||
|
||||
|
||||
|
||||
@@ -1,6 +1,3 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
import config
|
||||
from core.preprocessing import get_clean_data
|
||||
|
||||
@@ -8,154 +5,103 @@ from core.preprocessing import get_clean_data
|
||||
class DataService:
|
||||
def __init__(self):
|
||||
self._df = None
|
||||
|
||||
|
||||
@property
def df(self):
    """Return the cleaned dataset, loading it on first access.

    The frame comes from ``get_clean_data()`` and is cached on ``self._df``,
    so later accesses reuse the same object.
    """
    cached = self._df
    if cached is None:
        cached = self._df = get_clean_data()
    return cached
|
||||
|
||||
|
||||
def get_basic_stats(self):
|
||||
df = self.df
|
||||
|
||||
total_records = len(df)
|
||||
total_employees = df['ID'].nunique()
|
||||
total_absent_hours = df['Absenteeism time in hours'].sum()
|
||||
avg_absent_hours = round(df['Absenteeism time in hours'].mean(), 2)
|
||||
max_absent_hours = int(df['Absenteeism time in hours'].max())
|
||||
min_absent_hours = int(df['Absenteeism time in hours'].min())
|
||||
|
||||
high_risk_count = len(df[df['Absenteeism time in hours'] > 8])
|
||||
high_risk_ratio = round(high_risk_count / total_records, 4)
|
||||
|
||||
total_employees = df[config.EMPLOYEE_ID_COLUMN].nunique()
|
||||
avg_absent_hours = round(df[config.TARGET_COLUMN].mean(), 2)
|
||||
max_absent_hours = round(float(df[config.TARGET_COLUMN].max()), 1)
|
||||
min_absent_hours = round(float(df[config.TARGET_COLUMN].min()), 1)
|
||||
high_risk_count = len(df[df[config.TARGET_COLUMN] > 8])
|
||||
return {
|
||||
'total_records': total_records,
|
||||
'total_employees': total_employees,
|
||||
'total_absent_hours': int(total_absent_hours),
|
||||
'avg_absent_hours': avg_absent_hours,
|
||||
'max_absent_hours': max_absent_hours,
|
||||
'min_absent_hours': min_absent_hours,
|
||||
'high_risk_ratio': high_risk_ratio
|
||||
'high_risk_ratio': round(high_risk_count / total_records, 4),
|
||||
'industries_covered': int(df['所属行业'].nunique()),
|
||||
}
|
||||
|
||||
|
||||
def get_monthly_trend(self):
|
||||
df = self.df
|
||||
|
||||
monthly = df.groupby('Month of absence').agg({
|
||||
'Absenteeism time in hours': ['sum', 'mean', 'count']
|
||||
}).reset_index()
|
||||
|
||||
monthly = df.groupby('缺勤月份').agg({config.TARGET_COLUMN: ['sum', 'mean', 'count']}).reset_index()
|
||||
monthly.columns = ['month', 'total_hours', 'avg_hours', 'record_count']
|
||||
|
||||
months = ['1月', '2月', '3月', '4月', '5月', '6月',
|
||||
'7月', '8月', '9月', '10月', '11月', '12月']
|
||||
|
||||
result = {
|
||||
'months': months,
|
||||
'total_hours': [],
|
||||
'avg_hours': [],
|
||||
'record_counts': []
|
||||
}
|
||||
|
||||
for i in range(1, 13):
|
||||
row = monthly[monthly['month'] == i]
|
||||
if len(row) > 0:
|
||||
result['total_hours'].append(int(row['total_hours'].values[0]))
|
||||
result = {'months': [], 'total_hours': [], 'avg_hours': [], 'record_counts': []}
|
||||
for month in range(1, 13):
|
||||
row = monthly[monthly['month'] == month]
|
||||
result['months'].append(f'{month}月')
|
||||
if len(row):
|
||||
result['total_hours'].append(round(float(row['total_hours'].values[0]), 1))
|
||||
result['avg_hours'].append(round(float(row['avg_hours'].values[0]), 2))
|
||||
result['record_counts'].append(int(row['record_count'].values[0]))
|
||||
else:
|
||||
result['total_hours'].append(0)
|
||||
result['avg_hours'].append(0)
|
||||
result['record_counts'].append(0)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def get_weekday_distribution(self):
|
||||
df = self.df
|
||||
|
||||
weekday = df.groupby('Day of the week').agg({
|
||||
'Absenteeism time in hours': ['sum', 'mean', 'count']
|
||||
}).reset_index()
|
||||
|
||||
weekday = df.groupby('星期几').agg({config.TARGET_COLUMN: ['sum', 'mean', 'count']}).reset_index()
|
||||
weekday.columns = ['weekday', 'total_hours', 'avg_hours', 'record_count']
|
||||
|
||||
result = {
|
||||
'weekdays': [],
|
||||
'weekday_codes': [],
|
||||
'total_hours': [],
|
||||
'avg_hours': [],
|
||||
'record_counts': []
|
||||
}
|
||||
|
||||
for code in [2, 3, 4, 5, 6]:
|
||||
result = {'weekdays': [], 'weekday_codes': [], 'total_hours': [], 'avg_hours': [], 'record_counts': []}
|
||||
for code in range(1, 8):
|
||||
row = weekday[weekday['weekday'] == code]
|
||||
result['weekdays'].append(config.WEEKDAY_NAMES.get(code, str(code)))
|
||||
result['weekday_codes'].append(code)
|
||||
if len(row) > 0:
|
||||
result['total_hours'].append(int(row['total_hours'].values[0]))
|
||||
if len(row):
|
||||
result['total_hours'].append(round(float(row['total_hours'].values[0]), 1))
|
||||
result['avg_hours'].append(round(float(row['avg_hours'].values[0]), 2))
|
||||
result['record_counts'].append(int(row['record_count'].values[0]))
|
||||
else:
|
||||
result['total_hours'].append(0)
|
||||
result['avg_hours'].append(0)
|
||||
result['record_counts'].append(0)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def get_reason_distribution(self):
|
||||
df = self.df
|
||||
|
||||
reason = df.groupby('Reason for absence').agg({
|
||||
'Absenteeism time in hours': 'count'
|
||||
}).reset_index()
|
||||
|
||||
reason.columns = ['code', 'count']
|
||||
reason = df.groupby('请假原因大类').agg({config.TARGET_COLUMN: 'count'}).reset_index()
|
||||
reason.columns = ['name', 'count']
|
||||
reason = reason.sort_values('count', ascending=False)
|
||||
|
||||
total = reason['count'].sum()
|
||||
|
||||
result = {
|
||||
'reasons': []
|
||||
return {
|
||||
'reasons': [
|
||||
{
|
||||
'name': row['name'],
|
||||
'count': int(row['count']),
|
||||
'percentage': round(float(row['count']) / total * 100, 1),
|
||||
}
|
||||
for _, row in reason.iterrows()
|
||||
]
|
||||
}
|
||||
|
||||
for _, row in reason.iterrows():
|
||||
code = int(row['code'])
|
||||
result['reasons'].append({
|
||||
'code': code,
|
||||
'name': config.REASON_NAMES.get(code, f'原因{code}'),
|
||||
'count': int(row['count']),
|
||||
'percentage': round(row['count'] / total * 100, 1)
|
||||
})
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def get_season_distribution(self):
|
||||
df = self.df
|
||||
|
||||
season = df.groupby('Seasons').agg({
|
||||
'Absenteeism time in hours': ['sum', 'mean', 'count']
|
||||
}).reset_index()
|
||||
|
||||
season = df.groupby('季节').agg({config.TARGET_COLUMN: ['sum', 'mean', 'count']}).reset_index()
|
||||
season.columns = ['season', 'total_hours', 'avg_hours', 'record_count']
|
||||
|
||||
total_records = season['record_count'].sum()
|
||||
|
||||
result = {
|
||||
'seasons': []
|
||||
}
|
||||
|
||||
result = {'seasons': []}
|
||||
for code in [1, 2, 3, 4]:
|
||||
row = season[season['season'] == code]
|
||||
if len(row) > 0:
|
||||
result['seasons'].append({
|
||||
'code': int(code),
|
||||
'name': config.SEASON_NAMES.get(code, f'季节{code}'),
|
||||
'total_hours': int(row['total_hours'].values[0]),
|
||||
'avg_hours': round(float(row['avg_hours'].values[0]), 2),
|
||||
'record_count': int(row['record_count'].values[0]),
|
||||
'percentage': round(row['record_count'].values[0] / total_records * 100, 1)
|
||||
})
|
||||
|
||||
if not len(row):
|
||||
continue
|
||||
result['seasons'].append({
|
||||
'code': code,
|
||||
'name': config.SEASON_NAMES.get(code, f'季节{code}'),
|
||||
'total_hours': round(float(row['total_hours'].values[0]), 1),
|
||||
'avg_hours': round(float(row['avg_hours'].values[0]), 2),
|
||||
'record_count': int(row['record_count'].values[0]),
|
||||
'percentage': round(float(row['record_count'].values[0]) / total_records * 100, 1),
|
||||
})
|
||||
return result
|
||||
|
||||
|
||||
|
||||
@@ -1,41 +1,25 @@
|
||||
import os
|
||||
import numpy as np
|
||||
|
||||
import joblib
|
||||
import numpy as np
|
||||
|
||||
import config
|
||||
from core.model_features import (
|
||||
align_feature_frame,
|
||||
apply_label_encoders,
|
||||
build_prediction_dataframe,
|
||||
engineer_features,
|
||||
to_float_array,
|
||||
)
|
||||
|
||||
|
||||
MODEL_INFO = {
|
||||
'random_forest': {
|
||||
'name': 'random_forest',
|
||||
'name_cn': '随机森林',
|
||||
'description': '基于决策树的集成学习算法'
|
||||
},
|
||||
'xgboost': {
|
||||
'name': 'xgboost',
|
||||
'name_cn': 'XGBoost',
|
||||
'description': '高效的梯度提升算法'
|
||||
},
|
||||
'lightgbm': {
|
||||
'name': 'lightgbm',
|
||||
'name_cn': 'LightGBM',
|
||||
'description': '微软轻量级梯度提升框架'
|
||||
},
|
||||
'gradient_boosting': {
|
||||
'name': 'gradient_boosting',
|
||||
'name_cn': 'GBDT',
|
||||
'description': '梯度提升决策树'
|
||||
},
|
||||
'extra_trees': {
|
||||
'name': 'extra_trees',
|
||||
'name_cn': '极端随机树',
|
||||
'description': '随机森林的变体,随机性更强'
|
||||
},
|
||||
'stacking': {
|
||||
'name': 'stacking',
|
||||
'name_cn': 'Stacking集成',
|
||||
'description': '多层堆叠集成学习'
|
||||
}
|
||||
'random_forest': {'name': 'random_forest', 'name_cn': '随机森林', 'description': '稳健的树模型集成'},
|
||||
'xgboost': {'name': 'xgboost', 'name_cn': 'XGBoost', 'description': '梯度提升树模型'},
|
||||
'lightgbm': {'name': 'lightgbm', 'name_cn': 'LightGBM', 'description': '轻量级梯度提升树'},
|
||||
'gradient_boosting': {'name': 'gradient_boosting', 'name_cn': 'GBDT', 'description': '梯度提升决策树'},
|
||||
'extra_trees': {'name': 'extra_trees', 'name_cn': '极端随机树', 'description': '高随机性的树模型'},
|
||||
'stacking': {'name': 'stacking', 'name_cn': 'Stacking集成', 'description': '多模型融合'},
|
||||
}
|
||||
|
||||
|
||||
@@ -47,326 +31,172 @@ class PredictService:
|
||||
self.selected_features = None
|
||||
self.label_encoders = {}
|
||||
self.model_metrics = {}
|
||||
self.training_metadata = {}
|
||||
self.default_model = 'random_forest'
|
||||
|
||||
|
||||
def _ensure_models_loaded(self):
|
||||
if not self.models:
|
||||
self.load_models()
|
||||
|
||||
|
||||
def load_models(self):
|
||||
metadata_path = os.path.join(config.MODELS_DIR, 'training_metadata.pkl')
|
||||
if os.path.exists(metadata_path):
|
||||
self.training_metadata = joblib.load(metadata_path)
|
||||
|
||||
model_files = {
|
||||
'random_forest': 'random_forest_model.pkl',
|
||||
'xgboost': 'xgboost_model.pkl',
|
||||
'lightgbm': 'lightgbm_model.pkl',
|
||||
'gradient_boosting': 'gradient_boosting_model.pkl',
|
||||
'extra_trees': 'extra_trees_model.pkl',
|
||||
'stacking': 'stacking_model.pkl'
|
||||
'stacking': 'stacking_model.pkl',
|
||||
}
|
||||
|
||||
allowed_models = self.training_metadata.get('available_models')
|
||||
if allowed_models:
|
||||
model_files = {k: v for k, v in model_files.items() if k in allowed_models}
|
||||
|
||||
for name, filename in model_files.items():
|
||||
model_path = os.path.join(config.MODELS_DIR, filename)
|
||||
if os.path.exists(model_path):
|
||||
path = os.path.join(config.MODELS_DIR, filename)
|
||||
if os.path.exists(path):
|
||||
try:
|
||||
self.models[name] = joblib.load(model_path)
|
||||
print(f"Loaded {name} model")
|
||||
except Exception as e:
|
||||
print(f"Failed to load {name}: {e}")
|
||||
|
||||
self.models[name] = joblib.load(path)
|
||||
except Exception as exc:
|
||||
print(f'Failed to load model {name}: {exc}')
|
||||
|
||||
if os.path.exists(config.SCALER_PATH):
|
||||
self.scaler = joblib.load(config.SCALER_PATH)
|
||||
|
||||
feature_names_path = os.path.join(config.MODELS_DIR, 'feature_names.pkl')
|
||||
if os.path.exists(feature_names_path):
|
||||
self.feature_names = joblib.load(feature_names_path)
|
||||
|
||||
selected_features_path = os.path.join(config.MODELS_DIR, 'selected_features.pkl')
|
||||
if os.path.exists(selected_features_path):
|
||||
self.selected_features = joblib.load(selected_features_path)
|
||||
|
||||
label_encoders_path = os.path.join(config.MODELS_DIR, 'label_encoders.pkl')
|
||||
if os.path.exists(label_encoders_path):
|
||||
self.label_encoders = joblib.load(label_encoders_path)
|
||||
|
||||
metrics_path = os.path.join(config.MODELS_DIR, 'model_metrics.pkl')
|
||||
if os.path.exists(metrics_path):
|
||||
self.model_metrics = joblib.load(metrics_path)
|
||||
|
||||
if self.model_metrics:
|
||||
valid_metrics = {k: v for k, v in self.model_metrics.items() if k in self.models}
|
||||
if valid_metrics:
|
||||
best_model = max(valid_metrics.items(), key=lambda x: x[1]['r2'])
|
||||
self.default_model = best_model[0]
|
||||
|
||||
for filename, attr in [
|
||||
('feature_names.pkl', 'feature_names'),
|
||||
('selected_features.pkl', 'selected_features'),
|
||||
('label_encoders.pkl', 'label_encoders'),
|
||||
('model_metrics.pkl', 'model_metrics'),
|
||||
]:
|
||||
path = os.path.join(config.MODELS_DIR, filename)
|
||||
if os.path.exists(path):
|
||||
try:
|
||||
setattr(self, attr, joblib.load(path))
|
||||
except Exception as exc:
|
||||
print(f'Failed to load artifact {filename}: {exc}')
|
||||
|
||||
valid_metrics = {key: value for key, value in self.model_metrics.items() if key in self.models}
|
||||
if valid_metrics:
|
||||
self.default_model = max(valid_metrics.items(), key=lambda item: item[1]['r2'])[0]
|
||||
|
||||
def get_available_models(self):
|
||||
self._ensure_models_loaded()
|
||||
|
||||
models = []
|
||||
for name in self.models.keys():
|
||||
info = MODEL_INFO.get(name, {
|
||||
'name': name,
|
||||
'name_cn': name,
|
||||
'description': ''
|
||||
}).copy()
|
||||
info = MODEL_INFO.get(name, {'name': name, 'name_cn': name, 'description': ''}).copy()
|
||||
info['is_available'] = True
|
||||
info['is_default'] = (name == self.default_model)
|
||||
|
||||
if name in self.model_metrics:
|
||||
info['metrics'] = self.model_metrics[name]
|
||||
else:
|
||||
info['metrics'] = {'r2': 0, 'rmse': 0, 'mae': 0}
|
||||
|
||||
info['is_default'] = name == self.default_model
|
||||
info['metrics'] = self.model_metrics.get(name, {'r2': 0, 'rmse': 0, 'mae': 0})
|
||||
models.append(info)
|
||||
|
||||
models.sort(key=lambda x: x['metrics']['r2'], reverse=True)
|
||||
|
||||
models.sort(key=lambda item: item['metrics']['r2'], reverse=True)
|
||||
return models
|
||||
|
||||
|
||||
def predict_single(self, data, model_type=None):
|
||||
self._ensure_models_loaded()
|
||||
|
||||
if model_type is None:
|
||||
model_type = self.default_model
|
||||
|
||||
model_type = model_type or self.default_model
|
||||
if model_type not in self.models:
|
||||
available = list(self.models.keys())
|
||||
if available:
|
||||
model_type = available[0]
|
||||
else:
|
||||
fallback = next(iter(self.models), None)
|
||||
if fallback is None:
|
||||
return self._get_default_prediction(data)
|
||||
|
||||
model = self.models[model_type]
|
||||
|
||||
model_type = fallback
|
||||
if self.scaler is None or self.feature_names is None:
|
||||
return self._get_default_prediction(data)
|
||||
|
||||
|
||||
features = self._prepare_features(data)
|
||||
|
||||
try:
|
||||
predicted_hours = model.predict([features])[0]
|
||||
predicted_hours = max(0, float(predicted_hours))
|
||||
except Exception as e:
|
||||
print(f"Prediction error: {e}")
|
||||
predicted_hours = self.models[model_type].predict([features])[0]
|
||||
predicted_hours = self._inverse_transform_prediction(predicted_hours)
|
||||
predicted_hours = max(0.5, float(predicted_hours))
|
||||
except Exception:
|
||||
return self._get_default_prediction(data)
|
||||
|
||||
|
||||
risk_level, risk_label = self._get_risk_level(predicted_hours)
|
||||
|
||||
confidence = 0.85
|
||||
if model_type in self.model_metrics:
|
||||
confidence = max(0.5, self.model_metrics[model_type].get('r2', 0.85))
|
||||
|
||||
confidence = max(0.5, self.model_metrics.get(model_type, {}).get('r2', 0.82))
|
||||
return {
|
||||
'predicted_hours': round(predicted_hours, 2),
|
||||
'risk_level': risk_level,
|
||||
'risk_label': risk_label,
|
||||
'confidence': round(confidence, 2),
|
||||
'model_used': model_type,
|
||||
'model_name_cn': MODEL_INFO.get(model_type, {}).get('name_cn', model_type)
|
||||
'model_name_cn': MODEL_INFO.get(model_type, {}).get('name_cn', model_type),
|
||||
}
|
||||
|
||||
|
||||
def predict_compare(self, data):
|
||||
self._ensure_models_loaded()
|
||||
|
||||
results = []
|
||||
|
||||
for name in self.models.keys():
|
||||
try:
|
||||
result = self.predict_single(data, name)
|
||||
result['model'] = name
|
||||
result['model_name_cn'] = MODEL_INFO.get(name, {}).get('name_cn', name)
|
||||
|
||||
if name in self.model_metrics:
|
||||
result['r2'] = self.model_metrics[name]['r2']
|
||||
else:
|
||||
result['r2'] = 0
|
||||
|
||||
results.append(result)
|
||||
except Exception as e:
|
||||
print(f"Compare error for {name}: {e}")
|
||||
|
||||
results.sort(key=lambda x: x.get('r2', 0), reverse=True)
|
||||
|
||||
result = self.predict_single(data, name)
|
||||
result['model'] = name
|
||||
result['model_name_cn'] = MODEL_INFO.get(name, {}).get('name_cn', name)
|
||||
result['r2'] = self.model_metrics.get(name, {}).get('r2', 0)
|
||||
results.append(result)
|
||||
results.sort(key=lambda item: item.get('r2', 0), reverse=True)
|
||||
if results:
|
||||
results[0]['recommended'] = True
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def _prepare_features(self, data):
|
||||
feature_map = {
|
||||
'Reason for absence': data.get('reason_for_absence', 23),
|
||||
'Month of absence': data.get('month_of_absence', 7),
|
||||
'Day of the week': data.get('day_of_week', 3),
|
||||
'Seasons': data.get('seasons', 1),
|
||||
'Transportation expense': data.get('transportation_expense', 200),
|
||||
'Distance from Residence to Work': data.get('distance', 20),
|
||||
'Service time': data.get('service_time', 5),
|
||||
'Age': data.get('age', 30),
|
||||
'Work load Average/day': data.get('work_load', 250),
|
||||
'Hit target': data.get('hit_target', 95),
|
||||
'Disciplinary failure': data.get('disciplinary_failure', 0),
|
||||
'Education': data.get('education', 1),
|
||||
'Son': data.get('son', 0),
|
||||
'Social drinker': data.get('social_drinker', 0),
|
||||
'Social smoker': data.get('social_smoker', 0),
|
||||
'Pet': data.get('pet', 0),
|
||||
'Body mass index': data.get('bmi', 25)
|
||||
}
|
||||
|
||||
age = feature_map['Age']
|
||||
service_time = feature_map['Service time']
|
||||
work_load = feature_map['Work load Average/day']
|
||||
distance = feature_map['Distance from Residence to Work']
|
||||
expense = feature_map['Transportation expense']
|
||||
bmi = feature_map['Body mass index']
|
||||
son = feature_map['Son']
|
||||
pet = feature_map['Pet']
|
||||
social_drinker = feature_map['Social drinker']
|
||||
social_smoker = feature_map['Social smoker']
|
||||
hit_target = feature_map['Hit target']
|
||||
seasons = feature_map['Seasons']
|
||||
day_of_week = feature_map['Day of the week']
|
||||
|
||||
derived_features = {
|
||||
'workload_per_age': work_load / (age + 1),
|
||||
'expense_per_distance': expense / (distance + 1),
|
||||
'age_service_ratio': age / (service_time + 1),
|
||||
'has_children': 1 if son > 0 else 0,
|
||||
'has_pet': 1 if pet > 0 else 0,
|
||||
'family_responsibility': son + pet,
|
||||
'health_risk': 1 if (social_drinker == 1 or social_smoker == 1 or bmi > 30) else 0,
|
||||
'lifestyle_risk': int(social_drinker) + int(social_smoker),
|
||||
'age_group': 1 if age <= 30 else (2 if age <= 40 else (3 if age <= 50 else 4)),
|
||||
'service_group': 1 if service_time <= 5 else (2 if service_time <= 10 else (3 if service_time <= 20 else 4)),
|
||||
'bmi_category': 1 if bmi <= 18.5 else (2 if bmi <= 25 else (3 if bmi <= 30 else 4)),
|
||||
'workload_category': 1 if work_load <= 200 else (2 if work_load <= 250 else (3 if work_load <= 300 else 4)),
|
||||
'commute_category': 1 if distance <= 10 else (2 if distance <= 20 else (3 if distance <= 50 else 4)),
|
||||
'seasonal_risk': 1 if seasons in [1, 3] else 0,
|
||||
'weekday_risk': 1 if day_of_week in [2, 6] else 0,
|
||||
'hit_target_ratio': hit_target / 100,
|
||||
'experience_level': 1 if service_time <= 5 else (2 if service_time <= 10 else (3 if service_time <= 15 else 4)),
|
||||
'age_workload_interaction': age * work_load / 10000,
|
||||
'service_bmi_interaction': service_time * bmi / 100
|
||||
}
|
||||
|
||||
all_features = {**feature_map, **derived_features}
|
||||
|
||||
features = []
|
||||
for fname in self.feature_names:
|
||||
if fname in all_features:
|
||||
val = all_features[fname]
|
||||
|
||||
if fname in self.label_encoders:
|
||||
try:
|
||||
val = self.label_encoders[fname].transform([str(val)])[0]
|
||||
except:
|
||||
val = 0
|
||||
|
||||
features.append(float(val))
|
||||
else:
|
||||
features.append(0.0)
|
||||
|
||||
features = np.array(features).reshape(1, -1)
|
||||
features = self.scaler.transform(features)[0]
|
||||
|
||||
X_df = build_prediction_dataframe(data)
|
||||
X_df = engineer_features(X_df)
|
||||
X_df = apply_label_encoders(X_df, self.label_encoders)
|
||||
X_df = align_feature_frame(X_df, self.feature_names)
|
||||
features = self.scaler.transform(to_float_array(X_df))[0]
|
||||
if self.selected_features:
|
||||
selected_indices = []
|
||||
for sf in self.selected_features:
|
||||
if sf in self.feature_names:
|
||||
selected_indices.append(self.feature_names.index(sf))
|
||||
selected_indices = [self.feature_names.index(name) for name in self.selected_features if name in self.feature_names]
|
||||
if selected_indices:
|
||||
features = features[selected_indices]
|
||||
|
||||
return features
|
||||
|
||||
|
||||
def _inverse_transform_prediction(self, prediction):
|
||||
if self.training_metadata.get('target_transform') == 'log1p':
|
||||
return float(np.expm1(prediction))
|
||||
return float(prediction)
|
||||
|
||||
def _get_risk_level(self, hours):
|
||||
if hours < 4:
|
||||
return 'low', '低风险'
|
||||
elif hours <= 8:
|
||||
if hours <= 8:
|
||||
return 'medium', '中风险'
|
||||
else:
|
||||
return 'high', '高风险'
|
||||
|
||||
return 'high', '高风险'
|
||||
|
||||
def _get_default_prediction(self, data):
|
||||
base_hours = 5.0
|
||||
|
||||
expense = data.get('transportation_expense', 200)
|
||||
if expense > 300:
|
||||
base_hours += 1.0
|
||||
elif expense < 150:
|
||||
base_hours = 3.8
|
||||
base_hours += min(float(data.get('monthly_overtime_hours', 24)) / 20, 3.0)
|
||||
base_hours += min(float(data.get('commute_minutes', 40)) / 50, 2.0)
|
||||
base_hours += 1.6 if int(data.get('is_night_shift', 0)) == 1 else 0
|
||||
base_hours += 1.8 if int(data.get('chronic_disease_flag', 0)) == 1 else 0
|
||||
base_hours += 0.9 if int(data.get('near_holiday_flag', 0)) == 1 else 0
|
||||
base_hours += 0.8 if int(data.get('medical_certificate_flag', 0)) == 1 else 0
|
||||
base_hours += 0.5 * int(data.get('children_count', 0))
|
||||
if data.get('leave_type') in ['病假', '工伤假', '婚假', '丧假']:
|
||||
base_hours += 2.5
|
||||
if data.get('stress_level') == '高':
|
||||
base_hours += 0.9
|
||||
if data.get('performance_level') == 'A':
|
||||
base_hours -= 0.5
|
||||
|
||||
distance = data.get('distance', 20)
|
||||
if distance > 40:
|
||||
base_hours += 1.5
|
||||
elif distance > 25:
|
||||
base_hours += 0.8
|
||||
|
||||
service_time = data.get('service_time', 5)
|
||||
if service_time < 3:
|
||||
base_hours += 0.5
|
||||
elif service_time > 15:
|
||||
base_hours -= 0.5
|
||||
|
||||
age = data.get('age', 30)
|
||||
if age > 50:
|
||||
base_hours += 0.5
|
||||
elif age < 25:
|
||||
base_hours += 0.3
|
||||
|
||||
work_load = data.get('work_load', 250)
|
||||
if work_load > 300:
|
||||
base_hours += 1.5
|
||||
elif work_load > 260:
|
||||
base_hours += 0.5
|
||||
|
||||
bmi = data.get('bmi', 25)
|
||||
if bmi > 30:
|
||||
base_hours += 0.8
|
||||
elif bmi < 20:
|
||||
base_hours += 0.3
|
||||
|
||||
if data.get('social_drinker', 0) == 1:
|
||||
base_hours += 0.8
|
||||
if data.get('social_smoker', 0) == 1:
|
||||
base_hours += 0.5
|
||||
|
||||
son = data.get('son', 0)
|
||||
if son > 0:
|
||||
base_hours += 0.3 * son
|
||||
|
||||
pet = data.get('pet', 0)
|
||||
if pet > 0:
|
||||
base_hours -= 0.1 * pet
|
||||
|
||||
hit_target = data.get('hit_target', 95)
|
||||
if hit_target < 90:
|
||||
base_hours += 0.5
|
||||
|
||||
base_hours = max(0.5, base_hours)
|
||||
|
||||
risk_level, risk_label = self._get_risk_level(base_hours)
|
||||
|
||||
return {
|
||||
'predicted_hours': round(base_hours, 2),
|
||||
'predicted_hours': round(max(0.5, base_hours), 2),
|
||||
'risk_level': risk_level,
|
||||
'risk_label': risk_label,
|
||||
'confidence': 0.75,
|
||||
'confidence': 0.72,
|
||||
'model_used': 'default',
|
||||
'model_name_cn': '默认规则'
|
||||
'model_name_cn': '默认规则',
|
||||
}
|
||||
|
||||
|
||||
def get_model_info(self):
|
||||
self._ensure_models_loaded()
|
||||
|
||||
models = self.get_available_models()
|
||||
|
||||
return {
|
||||
'models': models,
|
||||
'models': self.get_available_models(),
|
||||
'training_info': {
|
||||
'train_samples': 2884,
|
||||
'test_samples': 722,
|
||||
'feature_count': len(self.feature_names) if self.feature_names else 20,
|
||||
'training_date': '2026-03-08'
|
||||
}
|
||||
'train_samples': self.training_metadata.get('train_samples', 0),
|
||||
'test_samples': self.training_metadata.get('test_samples', 0),
|
||||
'feature_count': self.training_metadata.get('feature_count_after_selection', 0),
|
||||
'training_date': self.training_metadata.get('training_date', ''),
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user