搭建完整的前后端分离架构,实现数据概览、预测分析、聚类分析等核心功能模块 详细版: feat: 初始化员工缺勤分析系统项目 - 后端:基于 Flask 搭建 RESTful API,包含数据概览、特征分析、预测模型、聚类分析四大模块 - 前端:基于 Vue.js 构建单页应用,实现 Dashboard、预测、聚类、因子分析等页面 - 模型:集成随机森林、XGBoost、LightGBM、Stacking 等多种机器学习模型 - 文档:完成需求规格说明、系统架构设计、接口设计、数据设计、UI原型设计等文档
152 lines
4.7 KiB
Python
152 lines
4.7 KiB
Python
import pandas as pd
|
|
import numpy as np
|
|
|
|
import config
|
|
from core.preprocessing import get_clean_data
|
|
|
|
|
|
def calculate_correlation():
    """Return the pairwise correlation matrix of the numeric columns.

    The frame comes from ``get_clean_data()``. Only columns selected by
    ``select_dtypes(include=[np.number])`` take part, and a column named
    ``'ID'`` is left out when it is among them.

    Returns:
        The result of ``DataFrame.corr()`` over the remaining numeric columns.
    """
    data = get_clean_data()
    columns = data.select_dtypes(include=[np.number]).columns.tolist()

    # 'ID' is excluded from the correlation; it may legitimately be absent
    # from the numeric columns, in which case there is nothing to remove.
    try:
        columns.remove('ID')
    except ValueError:
        pass

    return data[columns].corr()
|
|
|
|
|
|
def get_correlation_for_heatmap():
    """Build the heatmap payload for a fixed shortlist of features.

    Takes the matrix from ``calculate_correlation()``, keeps only the
    shortlisted features that are actually present in it, and rounds the
    coefficients to two decimals.

    Returns:
        dict with two keys:
            ``'features'``: display labels, looked up in
                ``config.FEATURE_NAME_CN`` and falling back to the raw
                column name when there is no entry.
            ``'matrix'``: nested list of the rounded coefficients, in the
                same order as ``'features'`` on both axes.
    """
    full_corr = calculate_correlation()

    # NOTE: the trailing space in 'Work load Average/day ' is part of the
    # column name being looked up; do not strip it.
    shortlist = (
        'Age',
        'Service time',
        'Distance from Residence to Work',
        'Work load Average/day ',
        'Body mass index',
        'Absenteeism time in hours',
    )
    present = [name for name in shortlist if name in full_corr.columns]

    labels = [config.FEATURE_NAME_CN.get(name, name) for name in present]
    rounded = full_corr.loc[present, present].values.round(2)

    return {
        'features': labels,
        'matrix': rounded.tolist(),
    }
|
|
|
|
|
|
def calculate_feature_importance(model, feature_names):
    """Pair feature names with a model's importances, highest first.

    Args:
        model: Any object exposing a ``feature_importances_`` attribute.
        feature_names: Names paired positionally with the importances.
            Pairing stops at the shorter of the two sequences, and a
            repeated name keeps the last importance paired with it.

    Returns:
        A list of ``(name, importance)`` tuples sorted by importance in
        descending order; equal importances keep their original order.

    Raises:
        ValueError: If ``model`` has no ``feature_importances_`` attribute.
    """
    if not hasattr(model, 'feature_importances_'):
        raise ValueError("Model does not have feature_importances_ attribute")

    by_name = dict(zip(feature_names, model.feature_importances_))

    ranked = list(by_name.items())
    # list.sort is stable, so ties stay in their original relative order
    # even with reverse=True.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked
|
|
|
|
|
|
def get_feature_importance_from_model(model_path, feature_names):
    """Load a persisted model and rank its feature importances.

    Args:
        model_path: Location of the model file, passed to ``joblib.load``.
        feature_names: Names forwarded to ``calculate_feature_importance``.

    Returns:
        Whatever ``calculate_feature_importance`` returns for the loaded
        model and ``feature_names``.
    """
    # Imported here so that joblib is only required when this function runs.
    import joblib

    # NOTE(review): joblib.load deserializes pickled objects; confirm that
    # model_path only ever points at trusted files.
    return calculate_feature_importance(joblib.load(model_path), feature_names)
|
|
|
|
|
|
# Supported comparison dimensions:
#   key -> (column name, display name, {raw value: label}, positive label)
# A non-None positive label marks a column that is collapsed into two
# buckets: rows equal to 0 versus rows greater than 0. With None, every
# distinct value of the column becomes its own group.
_GROUP_DIMENSIONS = {
    'drinker': ('Social drinker', '饮酒习惯', {0: '不饮酒', 1: '饮酒'}, None),
    'smoker': ('Social smoker', '吸烟习惯', {0: '不吸烟', 1: '吸烟'}, None),
    'education': ('Education', '学历', {1: '高中', 2: '本科', 3: '研究生', 4: '博士'}, None),
    'children': ('Son', '子女', {0: '无子女'}, '有子女'),
    'pet': ('Pet', '宠物', {0: '无宠物'}, '有宠物'),
}

# Column whose mean is reported for every group.
_HOURS_COLUMN = 'Absenteeism time in hours'


def _summarize_group(name, value, rows, total_rows):
    """Build the summary dict for one non-empty group of rows."""
    return {
        'name': name,
        'value': value,
        'avg_hours': round(rows[_HOURS_COLUMN].mean(), 2),
        'count': len(rows),
        'percentage': round(len(rows) / total_rows * 100, 1),
    }


def group_comparison(dimension):
    """Compare average absenteeism hours between groups of one dimension.

    Args:
        dimension: One of ``'drinker'``, ``'smoker'``, ``'education'``,
            ``'children'`` or ``'pet'``.

    Returns:
        dict with the keys:
            ``'dimension'``: the argument, echoed back.
            ``'dimension_name'``: display name of the dimension.
            ``'groups'``: list of dicts (``name``, ``value``, ``avg_hours``,
                ``count``, ``percentage``), one per non-empty group.
            ``'difference'``: ``{'value', 'percentage'}`` comparing the
                first two groups; both are 0 when fewer than two groups
                exist, and ``percentage`` is 0 when the smaller average is
                not positive.

    Raises:
        ValueError: If ``dimension`` is not a supported key.
    """
    # Validate before loading so a bad argument fails without touching data.
    if dimension not in _GROUP_DIMENSIONS:
        raise ValueError(f"Invalid dimension: {dimension}")
    col, dimension_name, value_map, positive_label = _GROUP_DIMENSIONS[dimension]

    df = get_clean_data()
    total_rows = len(df)

    if positive_label is not None:
        # Two buckets: "none" (== 0) and "some" (> 0), reported as 0 and 1.
        buckets = [
            (value_map.get(0, '0'), 0, df[df[col] == 0]),
            (positive_label, 1, df[df[col] > 0]),
        ]
    else:
        # One bucket per distinct value. Missing values are dropped up
        # front: they can never match an equality filter, and NaN makes
        # the sort order undefined.
        buckets = [
            (value_map.get(val, str(val)), int(val), df[df[col] == val])
            for val in sorted(df[col].dropna().unique())
        ]

    # Empty buckets are skipped, which also guarantees total_rows > 0
    # whenever a percentage is computed.
    groups = [
        _summarize_group(name, value, rows, total_rows)
        for name, value, rows in buckets
        if len(rows) > 0
    ]

    # NOTE(review): only the first two groups are compared, so for
    # 'education' any further groups do not affect 'difference' -- confirm
    # this is the intended contract.
    if len(groups) >= 2:
        first, second = groups[0]['avg_hours'], groups[1]['avg_hours']
        raw_diff = abs(first - second)
        base = min(first, second)
        # Round the reported value: subtracting two rounded floats can
        # otherwise leak artifacts such as 1.2300000000000004.
        diff_value = round(raw_diff, 2)
        diff_percentage = round(raw_diff / base * 100, 1) if base > 0 else 0
    else:
        diff_value = 0
        diff_percentage = 0

    return {
        'dimension': dimension,
        'dimension_name': dimension_name,
        'groups': groups,
        'difference': {
            'value': diff_value,
            'percentage': diff_percentage,
        },
    }
|
|
|
|
|
|
if __name__ == '__main__':
    # Manual smoke run: print the heatmap payload, then one group comparison.
    print("Correlation matrix:")
    print(get_correlation_for_heatmap())

    print("\nGroup comparison (drinker):")
    print(group_comparison('drinker'))
|